diff --git a/.github/workflows/ax.yml b/.github/workflows/ax.yml
index 0b8c99463f..a37c972f0a 100644
--- a/.github/workflows/ax.yml
+++ b/.github/workflows/ax.yml
@@ -74,6 +74,10 @@ jobs:
        - { image: '2021-clang10', cxx: 'g++', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' }
      fail-fast: false
    steps:
+    - name: Enable Node 16
+      if: contains(matrix.config.image, '2021') || contains(matrix.config.image, '2022')
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: pybind11
      #if: contains(matrix.config.image, '2023') == false
@@ -123,14 +127,16 @@ jobs:
      matrix:
        config:
          #@note llvm10 never got its own brew formula...
-          - { runner: 'macos-latest', cxx: 'clang++', build: 'Release', llvm: '11' }
-          - { runner: 'macos-latest', cxx: 'clang++', build: 'Release', llvm: '12' }
-          - { runner: 'macos-latest', cxx: 'clang++', build: 'Release', llvm: '13' }
+          # Last macos runner before M1 (macos-14)
+          - { runner: 'macos-13', cxx: 'clang++', build: 'Release', llvm: '12' }
+          - { runner: 'macos-13', cxx: 'clang++', build: 'Release', llvm: '13' }
      fail-fast: false
    steps:
    - uses: actions/checkout@v3
    - name: install_deps
-      run: ./ci/install_macos.sh ${{ matrix.config.llvm }}
+      run: |
+        ./ci/install_macos.sh ${{ matrix.config.llvm }}
+        ./ci/install_tbb_macos.sh
    - name: build
      run: >
        ./ci/build.sh -v
@@ -139,7 +145,8 @@ jobs:
        --cargs=\"
          -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
          -DUSE_EXPLICIT_INSTANTIATION=OFF
-          -DLLVM_DIR=/usr/local/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm
+          -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install
+          -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm
        \"
    - name: test
      run: cd build && ctest -V
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c9b36595c9..3a4dd2c621 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -86,6 +86,11 @@ jobs:
        - { cxx: g++, image: '2022-clang11', abi: '9', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' }
      fail-fast: false
    steps:
+    - name: Enable Node 16
+      # Solution taken from https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default
+      if: contains(matrix.config.image, '2022')
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: pybind11
      #if: contains(matrix.config.image, '2023') == false
@@ -125,29 +130,16 @@ jobs:
        run: ccache --evict-older-than 1d

  windows:
-    # Windows CI. Tests static and dynamic builds with MT and MD respectively.
+    # Windows CI. Tests a dynamic build with MD.
    if: |
      github.event_name != 'workflow_dispatch' ||
      github.event.inputs.type == 'all' ||
      github.event.inputs.type == 'win'
    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }}
-    name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }}
+    name: windows
    env:
-      VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }}
+      VCPKG_DEFAULT_TRIPLET: x64-windows
    strategy:
-      matrix:
-        config:
-          # static build of blosc from vcpkg does not build internal sources.
-          # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and
-          # Boost as both shared and static libs are installed.
- # USE_EXPLICIT_INSTANTIATION is disabled for debug static libraries - # due to disk space constraints - # @note Commented out the static debug build due to linker OOM LNK1102 - - { vc: 'x64-windows-static', components: 'core,bin,view,render,test', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' } - #- { vc: 'x64-windows-static', components: 'core,bin,view,render,test', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON -DUSE_EXPLICIT_INSTANTIATION=OFF' } - - { vc: 'x64-windows', components: 'core,bin,view,render,python,test', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' } - - { vc: 'x64-windows', components: 'core,bin,view,render,python,test', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' } - #- { vc: 'x64-windows', build: 'Release', cmake: '-G \"MinGW Makefiles\" -DOPENVDB_CORE_STATIC=OFF' } fail-fast: false steps: - uses: actions/checkout@v3 @@ -155,17 +147,17 @@ jobs: shell: pwsh run: | # note: system path must be modified in a previous step to it's use - echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "${{github.workspace}}\build\openvdb\openvdb\${{ matrix.config.build }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: install run: ./ci/install_windows.sh - name: build run: > ./ci/build.sh -v - --config=${{ matrix.config.build }} - --components=${{ matrix.config.components }} + --config='Release' + --components='core,bin,view,render,python,test' --cargs=\' - ${{ matrix.config.cmake }} + -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF -DMSVC_COMPRESS_PDB=ON -DUSE_EXR=ON -DUSE_PNG=ON @@ -176,27 +168,29 @@ jobs: # Print the build directy size (monitor if we're hitting runner limits) run: du -h build - name: test - # Always run tests on weekly builds but skip Debug on commits as they take a while. 
-      # https://github.community/t/distinct-job-for-each-schedule/17811/2
-      if: contains(github.event.schedule, '0 7 * * 1') || matrix.config.build == 'Release'
-      run: cd build && ctest -V -C ${{ matrix.config.build }}
+      run: cd build && ctest -V -C Release

  macos:
    if: |
      github.event_name != 'workflow_dispatch' ||
      github.event.inputs.type == 'all' ||
      github.event.inputs.type == 'mac'
-    runs-on: macos-latest
+    runs-on: macos-13 # Last macos runner before M1 (macos-14)
    env:
      CXX: clang++
    steps:
    - uses: actions/checkout@v3
    - name: install
-      run: ./ci/install_macos.sh
+      run: |
+        ./ci/install_macos.sh
+        ./ci/install_tbb_macos.sh
    - name: build
      run: >
        ./ci/build.sh -v
        --build-type=Release
        --components=\"core,python,bin,view,render,test\"
+        --cargs=\'
+          -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install
+        \'
    - name: test
      run: cd build && ctest -V
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index d3febe0d76..21a9e2bac8 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -42,6 +42,9 @@ jobs:
      # need to re-write the python docs to use sphinx
      image: aswf/ci-openvdb:2022
    steps:
+    - name: Enable Node 16
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: install_doxygen
      run: ./ci/install_doxygen.sh 1_8_11
@@ -51,7 +54,12 @@
    # - name: install_epydoc
    #   run: pip install epydoc
    - name: install_latex
-      run: yum -y install texlive-latex-bin texlive-dvips texlive-collection-fontsrecommended texlive-collection-latexrecommended
+      run: |
+        # Fix error: Cannot prepare internal mirrorlist: No URLs in mirrorlist. CentOS 8 reached EOL, so the official mirror must be replaced with vault.centos.org
+        # Comment out mirrorlist and replace #baseurl=...mirror.centos.org with baseurl=...vault.centos.org in files starting with CentOS- in the /etc/yum.repos.d folder
+        sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*
+        sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+        yum -y install texlive-latex-bin texlive-dvips texlive-collection-fontsrecommended texlive-collection-latexrecommended
    - name: build
      run: >
        ./ci/build.sh -v
diff --git a/.github/workflows/houdini.yml b/.github/workflows/houdini.yml
index 81f9709e7d..f995a49704 100644
--- a/.github/workflows/houdini.yml
+++ b/.github/workflows/houdini.yml
@@ -64,7 +64,7 @@ jobs:
      ${{ needs.checksecret.outputs.HOUDINI_SECRETS == 'true' ||
      github.repository_owner == 'AcademySoftwareFoundation' }}
    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
-    name: hou:${{ matrix.config.hou }}-vfx:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}
+    name: hou:${{ matrix.config.hou_hash }}-vfx:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}
    container:
      image: aswf/ci-base:${{ matrix.config.image }}
    env:
@@ -73,12 +73,21 @@
    strategy:
      matrix:
        config:
-          #- { cxx: clang++, image: '2022', hou: '20_0', build: 'Release', components: 'core,hou,bin,view,render,python,test,axcore,axbin,axtest' }
-          - { cxx: clang++, image: '2021', hou: '19_5', build: 'Release', components: 'core,hou,bin,view,render,python,test,axcore,axbin,axtest' }
-          #- { cxx: clang++, image: '2022', hou: '20_0', build: 'Debug', components: 'core,hou' }
-          #- { cxx: g++, image: '2022', hou: '20_0', build: 'Release', components: 'core,hou' }
+          - { cxx: clang++, image: '2023.0', hou_hash: '20_0-newabi', build: 'Release', components:
'core,hou,bin,view,render,python,test,axcore,axbin,axtest' }
+          - { cxx: clang++, image: '2022', hou_hash: '20_0-oldabi', build: 'Release', components: 'core,hou' }
+          - { cxx: clang++, image: '2021', hou_hash: '19_5', build: 'Release', components: 'core,hou' }
+          - { cxx: clang++, image: '2023.0', hou_hash: '20_0-newabi', build: 'Debug', components: 'core,hou,bin,view,render,python,test,axcore,axbin,axtest' }
+          - { cxx: g++, image: '2023.0', hou_hash: '20_0-newabi', build: 'Release', components: 'core,hou,bin,view,render,python,test,axcore,axbin,axtest' }
+          - { cxx: g++, image: '2022', hou_hash: '20_0-oldabi', build: 'Release', components: 'core,hou' }
      fail-fast: false
    steps:
+    # See note on this step in the Houdini weekly.yml job
+    # We can remove this when we no longer use < 2023 images
+    - name: Enable Node 16
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
+    - name: remove zstd
+      run: yum -y remove zstd
    - uses: actions/checkout@v3
    - name: pybind11
      #if: contains(matrix.config.image, '2023') == false
@@ -93,18 +102,17 @@
      uses: actions/cache@v3
      with:
        path: /tmp/ccache
-        key: linux-vfx-hou${{ matrix.config.hou }}-${{ matrix.config.image }}-${{ matrix.config.cxx }}-${{ steps.timestamp.outputs.timestamp }}
-        restore-keys: linux-vfx-hou${{ matrix.config.hou }}-${{ matrix.config.image }}-${{ matrix.config.cxx }}-
+        key: linux-vfx-hou${{ matrix.config.hou_hash }}-${{ matrix.config.image }}-${{ matrix.config.cxx }}-${{ steps.timestamp.outputs.timestamp }}
+        restore-keys: linux-vfx-hou${{ matrix.config.hou_hash }}-${{ matrix.config.image }}-${{ matrix.config.cxx }}-
    - name: fetch_houdini
      uses: actions/cache/restore@v3
      with:
        path: hou
-        key: dummy-houdini${{ matrix.config.hou }}-${{ steps.timestamp.outputs.timestamp }}
-        restore-keys: vdb-v5-houdini${{ matrix.config.hou }}-
+        key: dummy-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }}
+        restore-keys: vdb-v5-houdini${{ matrix.config.hou_hash }}-
    - name: validate_houdini
      run: test -f "hou/hou.tar.gz"
-    # Make sure the cache is copied, not moved, as the cache action always posts the cache.
-    # Also make sure that the unpacked install is NOT in the root of the OpenVDB checkout
+    # Make sure that the unpacked install is NOT in the root of the OpenVDB checkout
      # otherwise CMake's install RPATHs will not work correctly.
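+    # (Hedged aside: one quick way to verify the install RPATHs on the built
+    # library is `readelf -d build/openvdb/openvdb/libopenvdb.so | grep -iE 'rpath|runpath'`;
+    # the library path here is an assumption based on the build layout used elsewhere in this CI.)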
    - name: install_houdini
      run: |
@@ -113,16 +121,63 @@
        cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
    - name: build
      run: |
-        export HFS="$HOME/houdini_install/hou"
-        export HDSO="${HFS}/dsolib"
-        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib64:${HDSO}"
-        ./ci/build.sh -v --build-type=Release --components="core,hou" --cargs=\"-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON -DDISABLE_CMAKE_SEARCH_PATHS=ON -DOPENVDB_BUILD_HOUDINI_ABITESTS=OFF -DOPENVDB_HOUDINI_INSTALL_PREFIX=/tmp\ -DTBB_INCLUDEDIR=/usr/local/include -DTBB_LIBRARYDIR=/usr/local/lib\"
+        ./ci/build.sh -v \
+          --build-type=Release \
+          --components="${{ matrix.config.components }}" \
+          --cargs=\" \
+            -DHOUDINI_ROOT=$HOME/houdini_install/hou \
+            -DOPENVDB_BUILD_HOUDINI_ABITESTS=OFF \
+            -DOPENVDB_HOUDINI_INSTALL_PREFIX=/tmp \
+            -DDISABLE_CMAKE_SEARCH_PATHS=ON \
+            -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON \
+          \"
    - name: test
      run: cd build && ctest -V
    # Keep ccache light by stripping out any caches not accessed in the last day
    - name: ccache_clean
      if: matrix.config.build == 'Release'
      run: ccache --evict-older-than 1d
-    # Delete the houdini tarball so that this dummy cache occupies no space
-    - name: delete_hou
-      run: rm -f hou/hou.tar.gz
+
+  macos-houdini:
+    needs: [checksecret]
+    if: >
+      ${{ needs.checksecret.outputs.HOUDINI_SECRETS == 'true' ||
+      github.repository_owner == 'AcademySoftwareFoundation' }}
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: fetch_houdini
+      uses: actions/cache/restore@v3
+      with:
+        path: hou
+        key: dummy-houdini
+        restore-keys: vdb-v5-houdini-macos-
+    - name: validate_houdini
+      run: test -f "hou/hou.tar.gz"
+    # Make sure that the unpacked install is NOT in the root of the OpenVDB checkout
+    # otherwise CMake's install RPATHs will not work correctly.
+    - name: install_houdini
+      run: |
+        mkdir $HOME/houdini_install
+        cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
+        cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
+    - name: install_deps
+      run: ./ci/install_macos.sh 15
+    - name: build
+      run: |
+        ./ci/build.sh -v \
+          --build-type=Release \
+          --components="core,hou,bin,view,render,python,test,axcore,axbin" \
+          --cargs=\" \
+            -DHOUDINI_ROOT=$HOME/houdini_install/hou \
+            -DOPENVDB_BUILD_HOUDINI_ABITESTS=OFF \
+            -DOPENVDB_HOUDINI_INSTALL_PREFIX=/tmp \
+            -DDISABLE_CMAKE_SEARCH_PATHS=ON \
+            -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON \
+            -DUSE_EXPLICIT_INSTANTIATION=OFF \
+            -DTbb_INCLUDE_DIR=$HOME/houdini_install/hou/Frameworks/Houdini.framework/Versions/Current/Resources/toolkit/include/tbb \
+            -DLLVM_DIR=/opt/homebrew/opt/llvm@15/lib/cmake/llvm \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install \
+          \"
+    - name: test
+      run: cd build && ctest -V
diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml
index 35c25010e4..cfdccb0d05 100644
--- a/.github/workflows/nanovdb.yml
+++ b/.github/workflows/nanovdb.yml
@@ -65,9 +65,17 @@ jobs:
        - { cxx: clang++, image: '2022-clang11', build: 'Debug' }
      fail-fast: false
    steps:
+    - name: Enable Node 16
+      if: contains(matrix.config.image, '2022')
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: install_cuda_11
      run: |
+        # Fix error: Cannot prepare internal mirrorlist: No URLs in mirrorlist. CentOS 8 reached EOL, so the official mirror must be replaced with vault.centos.org
+        # Comment out mirrorlist and replace #baseurl=...mirror.centos.org with baseurl=...vault.centos.org in files starting with CentOS- in the /etc/yum.repos.d folder
+        sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*
+        sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
        yum -y install yum-utils
        yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
        echo "Installing cuda toolkit"
@@ -96,43 +104,32 @@
      github.event.inputs.type == 'win'
    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }}
    env:
-      VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }}
+      VCPKG_DEFAULT_TRIPLET: 'x64-windows'
      visual_studio: "Visual Studio 17 2022"
-      cuda: "11.6.2"
+      cuda: "12.4.0"
    strategy:
-      matrix:
-        config:
-          # static build of blosc from vcpkg does not build internal sources.
-          # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and
-          # Boost as both shared and static libs are installed.
-          - { vc: 'x64-windows-static', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded' }
-          - { vc: 'x64-windows-static', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDebug' }
-          - { vc: 'x64-windows', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
-          - { vc: 'x64-windows', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
      fail-fast: false
    steps:
    - uses: actions/checkout@v3
    - name: path
      run: |
        # note: system path must be modified in a previous step to it's use
-        echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-        echo "${{github.workspace}}\build\openvdb\openvdb\${{ matrix.config.build }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
    - name: install_cuda
      shell: powershell
      run: .\ci\install_windows_cuda.ps1
    - name: install_deps
      shell: bash
-      run: |
-        vcpkg update
-        vcpkg install zlib tbb gtest blosc boost-iostreams boost-system boost-any boost-uuid boost-interprocess boost-algorithm
+      run: ./ci/install_windows.sh
    - name: build
      shell: bash
      run: >
        ./ci/build.sh -v
-        --config=${{ matrix.config.build }}
+        --config=Release
        --components=core,nano,nanotest,nanoexam,nanobench,nanotool
        --cargs=\'
-        ${{ matrix.config.cmake }}
+        -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF
        -DMSVC_COMPRESS_PDB=ON
        -DUSE_EXPLICIT_INSTANTIATION=OFF
        -DNANOVDB_USE_CUDA=ON
@@ -155,13 +152,15 @@
    strategy:
      matrix:
        config:
-          - { runner: 'macos-11', cxx: 'clang++', build: 'Release' }
-          - { runner: 'macos-11', cxx: 'clang++', build: 'Debug' }
+          - { runner: 'macos-12', cxx: 'clang++', build: 'Release' }
+          - { runner: 'macos-12', cxx: 'clang++', build: 'Debug' }
      fail-fast: false
    steps:
    - uses: actions/checkout@v3
    - name: install_deps
-      run: ./ci/install_macos.sh
+      run: |
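+        # (tbb was split out of ci/install_macos.sh into the new
+        # ci/install_tbb_macos.sh helper invoked below)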
+        ./ci/install_macos.sh
+        ./ci/install_tbb_macos.sh
    - name: build
      run: >
        ./ci/build.sh -v
@@ -188,6 +187,6 @@ jobs:
        cd nanovdb/nanovdb
        sudo mkdir .build
        cd .build
-        sudo cmake -DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_BUILD_UNITTESTS=ON ../
+        sudo cmake -DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_BUILD_UNITTESTS=ON -DNANOVDB_USE_OPENVDB=OFF -DNANOVDB_USE_CUDA=OFF ../
        sudo make -j8 install
        sudo ctest -V
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 867e440398..5debe30be5 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -34,7 +34,7 @@ jobs:
  # download Houdini and cache it. The secrets are used in download_houdini.py
  checksecret:
    name: Verify Houdini Secrets
-    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    runs-on: ubuntu-latest
    outputs:
      HOUDINI_SECRETS: ${{ steps.check.outputs.HOUDINI_SECRETS }}
    steps:
@@ -46,6 +46,10 @@
    - name: Skip Next Jobs
      if: steps.check.outputs.HOUDINI_SECRETS != 'true'
      run: echo "HOUDINI_CLIENT_ID and HOUDINI_SECRET_KEY GitHub Action Secrets needs to be set to install Houdini builds"
+    # Explicitly error on the ASWF repo, as we expect this secret to always exist
+    - name: Error ASWF
+      if: steps.check.outputs.HOUDINI_SECRETS != 'true' && github.repository_owner == 'AcademySoftwareFoundation'
+      run: exit 1

  # download the latest production version of Houdini X, strip out headers,
  # libraries and binaries required for building OpenVDB and put it into
@@ -53,13 +57,12 @@
  linux_houdini:
    needs: [checksecret]
    if: |
-      (needs.checksecret.outputs.HOUDINI_SECRETS == 'true' ||
-      github.repository_owner == 'AcademySoftwareFoundation') &&
+      (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') &&
      (github.event_name != 'workflow_dispatch' ||
-      github.event.inputs.type == 'all' ||
-      github.event.inputs.type == 'houdini')
+       github.event.inputs.type == 'all' ||
+       github.event.inputs.type == 'houdini')
    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
-    name: linux-houdini:${{ matrix.config.houdini_version }}
+    name: linux-houdini:${{ matrix.config.hou_hash }}
    env:
      CXX: clang++
      HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }}
@@ -67,12 +70,16 @@
    strategy:
      matrix:
        config:
-          - { houdini_version: '19.5', houdini_version_str: '19_5' }
-          #- { houdini_version: '20.0', houdini_version_str: '20_0' }
+          - { houdini_version: '19.5', platform: 'linux_x86_64_gcc9.3', hou_hash: '19_5' }
+          - { houdini_version: '20.0', platform: 'linux_x86_64_gcc9.3', hou_hash: '20_0-oldabi' }
+          - { houdini_version: '20.0', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_0-newabi' }
      fail-fast: false
    container:
      image: aswf/ci-base:2023
    steps:
+    - name: Enable Node 16
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    # We bumped from the 2021 CI image to 2023 here to fix some OpenSSL issues
    # with the Houdini download script. In so doing we broke some of the caching
@@ -88,7 +95,39 @@
      id: timestamp
      run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
    - name: download_houdini
-      run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ON
+      run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ${{ matrix.config.platform }} --prod
+    - name: install_houdini
+      run: |
+        mkdir $HOME/houdini_install
+        cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
+        cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
+    - name: write_houdini_cache
+      uses: actions/cache/save@v3
+      with:
+        path: hou
+        key: vdb-v5-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }}
+
+  macos_houdini:
+    needs: [checksecret]
+    if: |
+      (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') &&
+      (github.event_name != 'workflow_dispatch' ||
+       github.event.inputs.type == 'all' ||
+       github.event.inputs.type == 'houdini')
+    # Note that macos-14 (current macos-latest) switches to M1. We could test
+    # the arm build here instead of the x86 one.
+    runs-on: macos-latest
+    name: macos-houdini-20
+    env:
+      HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }}
+      HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }}
+    steps:
+    - uses: actions/checkout@v3
+    - name: timestamp
+      id: timestamp
+      run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
+    - name: download_houdini
+      run: ./ci/download_houdini.sh 20.0 macosx_arm64_clang14.0_13 --prod
    - name: install_houdini
      run: |
        mkdir $HOME/houdini_install
@@ -98,7 +137,7 @@
      uses: actions/cache/save@v3
      with:
        path: hou
-        key: vdb-v5-houdini${{ matrix.config.houdini_version_str }}-${{ steps.timestamp.outputs.timestamp }}
+        key: vdb-v5-houdini-macos-${{ steps.timestamp.outputs.timestamp }}

  #############################################################################
  ########################### Core Library Extras #############################
  #############################################################################
@@ -135,6 +174,9 @@
        - { name: 'conf', build: 'Release', components: 'core,python,bin,view,render,test,axcore,axtest', cmake: '-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON' }
      fail-fast: false
    steps:
+    - name: Enable Node 16
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: pybind11
      #if: contains(container.image, '2023') == false
@@ -161,9 +203,10 @@
      matrix:
        config:
          - { runson: ubuntu-latest, cxx: g++, cmake: '' }
-          - { runson: ubuntu-latest, cxx: clang++, cmake: '' }
+          # Disable the clang job for now. See https://github.com/actions/runner-images/issues/8659
+          # - { runson: ubuntu-latest, cxx: clang++, cmake: '' }
          # @todo gcc on macos
-          - { runson: macos-latest, cxx: '', cmake: '-D CMAKE_CXX_COMPILER=/usr/local/opt/llvm@15/bin/clang++' }
+          - { runson: macos-latest, cxx: '', cmake: '-D CMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@15/bin/clang++' }
      fail-fast: false
    steps:
    - uses: actions/checkout@v3
@@ -173,6 +216,7 @@
        sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev llvm-dev libgtest-dev libcppunit-dev pybind11-dev
      elif [ "$RUNNER_OS" == "macOS" ]; then
        ./ci/install_macos.sh 15
+        ./ci/install_tbb_macos.sh
      else
        echo "$RUNNER_OS not supported"; exit 1
      fi
@@ -185,6 +229,60 @@
    - name: test
      run: cd build && ctest -V

+  windows:
+    # Windows CI. Tests static and dynamic builds with MT and MD respectively.
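+    # (This matrix moved here from build.yml, so the heavier static and Debug
+    # configurations now run on the weekly schedule only.)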
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'win'
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }}
+    name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }}
+    env:
+      VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }}
+    strategy:
+      matrix:
+        config:
+          # static build of blosc from vcpkg does not build internal sources.
+          # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and
+          # Boost as both shared and static libs are installed.
+          # USE_EXPLICIT_INSTANTIATION is disabled for debug static libraries
+          # due to disk space constraints
+          - { vc: 'x64-windows-static', components: 'core,bin,view,render,test', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' }
+          - { vc: 'x64-windows', components: 'core,bin,view,render,python,test', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
+          - { vc: 'x64-windows', components: 'core,bin,view,render,python,test', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v3
+    - name: path
+      shell: pwsh
+      run: |
+        # note: system path must be modified in a previous step prior to its use
+        echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "${{github.workspace}}\build\openvdb\openvdb\${{ matrix.config.build }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+    - name: install
+      run: ./ci/install_windows.sh
+    - name: build
+      run: >
+        ./ci/build.sh -v
+        --config=${{ matrix.config.build }}
+        --components=${{ matrix.config.components }}
+        --cargs=\'
+        ${{ matrix.config.cmake }}
+        -DMSVC_COMPRESS_PDB=ON
+        -DUSE_EXR=ON
+        -DUSE_PNG=ON
+        -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET}
+        -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\"
+        \'
+    - name: size
+      # Print the build directory size (monitor if we're hitting runner limits)
+      run: du -h build
+    - name: test
+      # Always run tests on weekly builds but skip Debug on commits as they take a while.
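+      # (github.event.schedule carries the cron string that triggered a scheduled
+      # run, so the '0 7 * * 1' check below must match this workflow's cron trigger.)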
+ # https://github.community/t/distinct-job-for-each-schedule/17811/2 + if: contains(github.event.schedule, '0 7 * * 1') || matrix.config.build == 'Release' + run: cd build && ctest -V -C ${{ matrix.config.build }} + ############################################################################# ############################ AX Library Extras ############################## ############################################################################# @@ -216,6 +314,10 @@ jobs: - { image: '2022-clang11', cxx: 'g++', build: 'Release', components: 'core', cmake: '' } fail-fast: false steps: + - name: Enable Node 16 + if: contains(matrix.config.image, '2021') || contains(matrix.config.image, '2022') + run: | + echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV - uses: actions/checkout@v3 - name: pybind11 #f: contains(matrix.config.image, '2023') == false @@ -256,7 +358,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'ax' - runs-on: macos-latest + runs-on: macos-13 name: macos-cxx:${{ matrix.config.cxx }}-llvm:${{ matrix.config.llvm }}-${{ matrix.config.build }} env: CXX: ${{ matrix.config.cxx }} @@ -270,7 +372,9 @@ jobs: steps: - uses: actions/checkout@v3 - name: install_deps - run: ./ci/install_macos.sh ${{ matrix.config.llvm }} + run: | + ./ci/install_macos.sh ${{ matrix.config.llvm }} + ./ci/install_tbb_macos.sh - name: build run: > ./ci/build.sh -v @@ -279,7 +383,8 @@ jobs: --cargs=\" -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON -DUSE_EXPLICIT_INSTANTIATION=OFF - -DLLVM_DIR=/usr/local/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm + -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install + -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm \" - name: test run: cd build && ctest -V @@ -315,10 +420,7 @@ jobs: - name: llvm run: ./ci/install_llvm_windows.sh ${{ matrix.config.crt }} - name: install - run: | - vcpkg update - vcpkg install zlib tbb cppunit blosc python3 \ - boost-iostreams boost-system boost-any boost-uuid boost-interprocess boost-algorithm pybind11 + run: ./ci/install_windows.sh - name: build run: > ./ci/build.sh -v @@ -348,6 +450,64 @@ jobs: ################################## Blosc #################################### ############################################################################# + windows-nanovdb: + if: | + github.event_name != 'workflow_dispatch' || + github.event.inputs.type == 'all' || + github.event.inputs.type == 'win' + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + env: + VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }} + visual_studio: "Visual Studio 17 2022" + cuda: "12.4.0" + strategy: + matrix: + config: + # static build of blosc from vcpkg does not build internal sources. + # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and + # Boost as both shared and static libs are installed. 
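+          # (These entries mirror the weekly windows job above but add an explicit
+          # CMAKE_MSVC_RUNTIME_LIBRARY selection for the static MT/MTd builds.)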
+          - { vc: 'x64-windows-static', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded' }
+          - { vc: 'x64-windows-static', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDebug' }
+          - { vc: 'x64-windows', build: 'Release', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
+          - { vc: 'x64-windows', build: 'Debug', cmake: '-A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF' }
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v3
+    - name: path
+      shell: powershell
+      run: |
+        # note: system path must be modified in a previous step prior to its use
+        echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "${{github.workspace}}\build\openvdb\openvdb\${{ matrix.config.build }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+    - name: install_cuda
+      shell: powershell
+      run: .\ci\install_windows_cuda.ps1
+    - name: install_deps
+      shell: bash
+      run: ./ci/install_windows.sh
+    - name: build
+      shell: bash
+      run: >
+        ./ci/build.sh -v
+        --config=${{ matrix.config.build }}
+        --components=core,nano,nanotest,nanoexam,nanobench,nanotool
+        --cargs=\'
+        ${{ matrix.config.cmake }}
+        -DMSVC_COMPRESS_PDB=ON
+        -DUSE_EXPLICIT_INSTANTIATION=OFF
+        -DNANOVDB_USE_CUDA=ON
+        -DNANOVDB_USE_OPENVDB=ON
+        -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET}
+        -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\"
+        \'
+    - name: test
+      shell: bash
+      run: cd build && ctest -V -E ".*cuda.*"
+
+  #############################################################################
+  ################################## Blosc ####################################
+  #############################################################################
+
  linux-blosc:
    if: |
      github.event_name != 'workflow_dispatch' ||
      github.event.inputs.type == 'all' ||
@@ -362,6 +522,9 @@
        blosc: ['1.18.0','1.19.0','1.20.0','1.21.0']
      fail-fast: false
    steps:
+    - name: Enable Node 16
+      run: |
+        echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
    - uses: actions/checkout@v3
    - name: install_blosc
      run: sudo ./ci/install_blosc.sh ${{ matrix.blosc }}
@@ -378,74 +541,72 @@
  #############################################################################

  linux-abi-checker:
-    # v10.0.0 doesn't exist yet, so can't run this automatically.
    if: |
      github.event_name == 'workflow_dispatch' &&
      (github.event.inputs.type == 'all' ||
      github.event.inputs.type == 'abi')
-    # abi-dumper version verified to work with 20.04/GCC9
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    env:
-      VDB_MAJOR_VERSION: 10
+      # The 'abicheck' build type sets these, but older versions of the library
+      # may not have this build type.
See OpenVDBCXX.cmake + CXXFLAGS: "-gdwarf-4 -g3 -ggdb -Og" steps: + - name: Enable Node 16 + run: | + echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV - uses: actions/checkout@v3 with: fetch-depth: 0 + fetch-tags: true + # Compute the latest major version - that is used as our baseline + # note: For CI forks, make sure you have your tags synced + - name: get_major_version + run: | + LATEST_VERSION_TAG=$(git tag --merged | sort --version-sort | tail -n1) + echo "Computed latest VDB tag: ${LATEST_VERSION_TAG}" + VDB_MAJOR_VERSION=$(echo ${LATEST_VERSION_TAG} | cut -f1 -d '.' | tr -d -c 0-9) + echo "Using major version: ${VDB_MAJOR_VERSION}" + echo "VDB_MAJOR_VERSION=${VDB_MAJOR_VERSION}" >> "$GITHUB_ENV" - name: install_deps - run: sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev elfutils - # abi-compliance-checker and abi-dumper - # - # @note that abi-dumper is available through apt but at the time of writing this - # the version there (1.1) doesn't work correctly and maniftest by creating an - # invalid ABI report with missing headers. This then always reports 100% success - # rate when used with abi-compliance-checker. - # To fix, install both from source and checkout specific commits for both - # which have been verified to work on ubuntu 20.04. - # - # @warning If you update these, test that they fail when expected! - # - # Also note that these are far superior to abigail/abidiff tools from redhat + run: sudo apt-get -q install -y libboost-iostreams-dev libtbb-dev libblosc-dev elfutils - name: install_abi_checker - run: | - git clone https://github.com/lvc/abi-dumper.git abi-dumper - cd abi-dumper && git checkout 16bb467cd7d343dd3a16782b151b56cf15509594 && cd - - git clone https://github.com/lvc/abi-compliance-checker abi-compliance-checker - cd abi-compliance-checker && git checkout 7c175c45a8ba9ac41b8e47d8ebbab557b623b18e && cd - - - name: build_latest + run: sudo apt-get -q install -y abi-dumper abi-compliance-checker + - name: build_new run: > - sudo ./ci/build.sh -v - --build-dir=build_latest - --build-type=Debug + ./ci/build.sh -v + --build-dir=build_new + --build-type=abicheck --target=openvdb_shared --components=\"core\" --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' - name: checkout_baseline run: git checkout v${VDB_MAJOR_VERSION}.0.0 - - name: build_baseline + - name: build_old run: > - sudo ./ci/build.sh -v - --build-type=Debug + ./ci/build.sh -v + --build-dir=build_old + --build-type=abicheck --target=openvdb_shared --components=\"core\" --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' - name: abi_dump run: | - abi-dumper/abi-dumper.pl build_latest/openvdb/openvdb/libopenvdb.so -o ABI-1.dump -lver 1 - abi-dumper/abi-dumper.pl build/openvdb/openvdb/libopenvdb.so -o ABI-2.dump -lver 2 + abi-dumper build_new/openvdb/openvdb/libopenvdb.so -o ABI-NEW.dump -lver 1 + abi-dumper build_old/openvdb/openvdb/libopenvdb.so -o ABI-OLD.dump -lver 2 # Replace the version namespace in the latest ABI dump with the baseline # version we're comparing against. We should probably instead build the # latest with the baseline version number but no CMake/defines allow us to # do this. 
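+    # (Hedged illustration of the substitution below: with VDB_MAJOR_VERSION=11,
+    # a dump fragment such as "openvdb::v11_1::GridBase" becomes
+    # "openvdb::v11_0::GridBase", so both dumps share one version namespace.)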
- name: replace_symbols - run: sed -i -E 's/openvdb([^v]*)v'${VDB_MAJOR_VERSION}'_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-1.dump + run: sed -i -E 's/openvdb([^v]*)v[0-9]*_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-NEW.dump - name: abi_check # -strict treats warnings as errors # -extended checks all member data # we check everything _not_ in openvdb::**::internal namespace run: > - abi-compliance-checker/abi-compliance-checker.pl -l OPENVDB - -old ABI-2.dump - -new ABI-1.dump + abi-compliance-checker -l OPENVDB + -old ABI-OLD.dump + -new ABI-NEW.dump -skip-internal-symbols "\d(openvdb.*internal)" -skip-internal-types "(openvdb.*internal)::" -strict diff --git a/CHANGES b/CHANGES index c51d115b50..c0d27a1ce9 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,9 @@ OpenVDB Version History ======================= +Version 11.0.1 - In development + + Version 11.0.0 - November 1, 2023 This version introduces ABI changes relative to older major releases, diff --git a/CMakeLists.txt b/CMakeLists.txt index 8329dda947..ea389222ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ endif() set(OpenVDB_MAJOR_VERSION 11) set(OpenVDB_MINOR_VERSION 0) -set(OpenVDB_PATCH_VERSION 0) +set(OpenVDB_PATCH_VERSION 1) set(OpenVDB_VERSION "${OpenVDB_MAJOR_VERSION}.${OpenVDB_MINOR_VERSION}.${OpenVDB_PATCH_VERSION}") project(OpenVDB LANGUAGES CXX VERSION ${OpenVDB_VERSION}) @@ -64,7 +64,7 @@ project(OpenVDB LANGUAGES CXX VERSION ${OpenVDB_VERSION}) include(CMakeDependentOption) include(GNUInstallDirs) -# todo epydoc and pdflatex +# Components option(OPENVDB_BUILD_CORE "Enable the core OpenVDB library. Both static and shared versions are enabled by default" ON) option(OPENVDB_BUILD_BINARIES "Enable the vdb binaries. Only vdb_print is enabled by default" ON) option(OPENVDB_BUILD_PYTHON_MODULE "Build the pyopenvdb Python module" OFF) @@ -72,21 +72,34 @@ option(OPENVDB_BUILD_UNITTESTS "Build the OpenVDB unit tests" OFF) option(OPENVDB_BUILD_DOCS "Build the OpenVDB documentation" OFF) option(OPENVDB_BUILD_HOUDINI_PLUGIN "Build the Houdini plugin" OFF) option(OPENVDB_BUILD_HOUDINI_ABITESTS "Build the Houdini ABI tests" OFF) - +option(OPENVDB_BUILD_MAYA_PLUGIN "Build the Maya plugin" OFF) option(OPENVDB_BUILD_AX "Build the OpenVDB AX library. Turns ON if USE_AX is ON." ${USE_AX}) option(OPENVDB_BUILD_AX_UNITTESTS "Build the OpenVDB AX unit tests" OFF) - option(OPENVDB_BUILD_NANOVDB "Build the NanoVDB library. Turns ON if USE_NANOVDB is ON." ${USE_NANOVDB}) -option(OPENVDB_BUILD_MAYA_PLUGIN "Build the Maya plugin" OFF) -option(OPENVDB_USE_DELAYED_LOADING "Build the core OpenVDB library with delayed-loading." ON) +# Global options option(OPENVDB_ENABLE_RPATH "Build with RPATH information" ON) +option(OPENVDB_ENABLE_ASSERTS "Build with asserts in OpenVDB code enabled" OFF) +option(OPENVDB_USE_DELAYED_LOADING "Build the core OpenVDB library with delayed-loading." 
ON)
 option(OPENVDB_CXX_STRICT "Enable or disable pre-defined compiler warnings" OFF)
-
 cmake_dependent_option(OPENVDB_INSTALL_CMAKE_MODULES "Install the provided OpenVDB CMake modules when building the core library" ON "OPENVDB_BUILD_CORE" OFF)
+option(USE_CCACHE "Build using Ccache if found on the path" ON)
+# Disable this on Windows due to linker OOM issues:
+# LNK1248: image size (XXX) exceeds maximum allowable size (FFFFFFFF)
+# https://github.com/AcademySoftwareFoundation/openvdb/issues/1718
+# https://github.com/AcademySoftwareFoundation/openvdb/issues/1624
+cmake_dependent_option(USE_EXPLICIT_INSTANTIATION "Use explicit instantiation for all supported classes
+and methods against a pre-defined list of OpenVDB trees. This makes the core library larger and slower
+to compile, but speeds up the compilation of all dependent code by bypassing the expensive template
+instantiation." ON "NOT WIN32" OFF)
+option(OPENVDB_FUTURE_DEPRECATION "Generate messages for upcoming deprecation" ON)
+option(OPENVDB_ENABLE_UNINSTALL "Adds a CMake uninstall target." ON)
+option(USE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." OFF)
+# Component dependency options
+cmake_dependent_option(USE_PKGCONFIG "Use pkg-config to search for dependent libraries." ON "NOT WIN32" OFF)
 option(USE_HOUDINI [=[
Build the library against a Houdini installation. Turns on automatically if
OPENVDB_BUILD_HOUDINI_PLUGIN is enabled. When enabled, you do not need to
provide dependency locations for TBB, Blosc, Imath and OpenEXR. Boost must be
@@ -121,7 +134,6 @@ option(USE_NANOVDB "Use NanoVDB while building openvdb components." ${OPENVDB_BU
 cmake_dependent_option(OPENVDB_DISABLE_BOOST_IMPLICIT_LINKING "Disable the implicit linking of Boost libraries on Windows" ON "WIN32" OFF)
-option(USE_CCACHE "Build using Ccache if found on the path" ON)
 option(USE_STATIC_DEPENDENCIES [=[
Only search for and use static libraries. If OFF the shared versions of requried
libraries are prioritised, falling back to static libraries. Forcing individual static dependencies can be enabled by setting XXX_USE_STATIC_LIBS
@@ -136,15 +148,6 @@
its default system search routine if it cannot find a dependency with the provided
paths provided through the Xxx_ROOT, supported XXX_INCLUDEDIR/XXX_LIBRARYDIR
variables or the SYSTEM_LIBRARY_PATHS list will be searched.]=] OFF)
-option(OPENVDB_FUTURE_DEPRECATION "Generate messages for upcoming deprecation" ON)
-option(OPENVDB_ENABLE_UNINSTALL "Adds a CMake uninstall target." ON)
-option(USE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." OFF)
-cmake_dependent_option(USE_PKGCONFIG "Use pkg-config to search for dependent libraries." ON "NOT WIN32" OFF)
-
-option(USE_EXPLICIT_INSTANTIATION "Use explicit instantiation for all supported classes and methods against a
-pre-defined list of OpenVDB trees. This makes the core library larger and slower to compile, but speeds up
-the compilation of all dependent code by bypassing the expensive template instantation." ON)
-
 set(SYSTEM_LIBRARY_PATHS "" CACHE STRING [=[
A global list of library paths to additionally use into when searching for dependencies.]=])
 set(MSVC_MP_THREAD_COUNT "" CACHE STRING [=[
@@ -357,27 +360,16 @@
 include(cmake/config/OpenVDBCXX.cmake)

 #########################################################################

-# Configure malloc library. Use Jemalloc for Linux and non-Maya, otherwise Tbbmalloc.
-# * On Mac OSX, linking against Jemalloc < 4.3.0 seg-faults with this error: -# malloc: *** malloc_zone_unregister() failed for 0xaddress -# Houdini 17.5 and older ships with Jemalloc 3.6.0, so we make Tbbmalloc the default -# on Mac OSX (https://github.com/jemalloc/jemalloc/issues/420). Later versions of -# Jemalloc are thought to work, but haven't been tested. -# * On Windows, we follow SideFX's example in using Tbbmalloc due to the challenges -# of injecting into the Windows runtime to replace the system allocator. - -if((OPENVDB_BUILD_BINARIES OR OPENVDB_BUILD_UNITTESTS) AND CONCURRENT_MALLOC STREQUAL "Auto") - if(WIN32 OR APPLE OR USE_MAYA) +# Configure malloc library. Use Jemalloc if available, tbbmalloc otherwise + +if(CONCURRENT_MALLOC STREQUAL "Auto") + find_package(Jemalloc QUIET) + if(NOT TARGET Jemalloc::jemalloc) + message(WARNING "Unable to find Jemalloc, attempting to fall back to TBB malloc. + It is recommended to use Jemalloc for optimum performance.") set(CONCURRENT_MALLOC "Tbbmalloc") else() - find_package(Jemalloc QUIET) - if(NOT TARGET Jemalloc::jemalloc) - message(WARNING "Unable to find Jemalloc, attempting to fall back to TBB malloc. - It is recommended to use Jemalloc for optimum performance.") - set(CONCURRENT_MALLOC "Tbbmalloc") - else() - set(CONCURRENT_MALLOC "Jemalloc") - endif() + set(CONCURRENT_MALLOC "Jemalloc") endif() endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1ee6eda8af..e0838e682c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -57,18 +57,20 @@ code and has the right to release it under the [Mozilla Public License, version 2.0](LICENSE.md) license. See the [TAC documentation on contribution sign off](https://github.com/AcademySoftwareFoundation/tac/blob/master/process/contributing.md#contribution-sign-off) for more information on this requirement. -## Committer +## Maintainer -The committer role enables the participant to commit code directly to the repository, but also comes with the obligation to be a responsible leader in the community. +The maintainer role is the equivalent of the "Committer" role in the charter. -### Process for becoming a committer +This role enables the participant to commit code directly to the repository, but also comes with the obligation to be a responsible leader in the community. + +### Process for becoming a maintainer * Show your experience with the codebase through contributions and engagement on the community channels. -* Request to become a committer. -* Have the majority of committers approve you becoming a committer. +* Request to become a maintainer. +* Have the majority of maintainers approve you becoming a maintainer. * Your name and email is added to the MAINTAINERS.md file for the project. -### Committer responsibilities +### Maintainer responsibilities * Monitor email aliases. * Monitor Slack (delayed response is perfectly acceptable). @@ -76,28 +78,29 @@ The committer role enables the participant to commit code directly to the reposi * Make sure that ongoing PRs are moving forward at the right pace or close them. * Remain an active contributor to the project in general and the code base in particular. -### When does a committer lose committer status? +### When does a maintainer lose maintainer status? -If a committer is no longer interested or cannot perform the committer duties listed above, they +If a maintainer is no longer interested or cannot perform the maintainer duties listed above, they should volunteer to be moved to emeritus status. 
In extreme cases this can also occur by a vote of
-the committers per the voting process below.
+the maintainers per the voting process below.

## Technical Steering Committee (TSC) member

The Technical Steering Committee (TSC) oversees the overall technical direction of OpenVDB, as defined in the [charter](charter.md).

-TSC voting members consist of committers that have been nominated by the committers, with a supermajority of voting members required to have a committer elected to be a TSC voting member. TSC voting members term and succession is defined in the [charter](charter.md).
+TSC voting members consist of maintainers that have been nominated by the TSC, with a supermajority of voting members required to have a maintainer elected to be a TSC voting member. TSC voting members' term and succession are defined in the [charter](charter.md).

All meetings of the TSC are open to participation by any member of the OpenVDB community. Meeting times are listed in the [ASWF technical community calendar](https://lists.aswf.io/g/tac/calendar).

## Current TSC members

-* Ken Museth, Chair / NVIDIA
-* Andre Pradhana, DreamWorks
+* Ken Museth (Chair), Nvidia
+* Andre Pradhana, Nvidia
* Jeff Lait, SideFX
-* Nick Avramoussis, WETA
+* Nick Avramoussis, WETA FX
* Dan Bailey, ILM
-* Richard Jones, DNEG
+* Richard Jones, ILM
+* Gregory Hurst, United Therapeutics

# Release Process
diff --git a/MAINTAINERS.md b/MAINTAINERS.md
index df279e587e..1d9752fb24 100644
--- a/MAINTAINERS.md
+++ b/MAINTAINERS.md
@@ -14,3 +14,5 @@ The current OpenVDB maintainers are:
| Ken Museth | ken.museth@gmail.com
| Andre Pradhana | andre.pradhana@gmail.com
| Richard Jones | richardj@ilm.com
+| Gregory Hurst | tbd
+| Jonathan Swartz | tbd
diff --git a/RE-LICENSE_NOTE.txt b/RE-LICENSE_NOTE.txt
new file mode 100644
index 0000000000..47c3c73623
--- /dev/null
+++ b/RE-LICENSE_NOTE.txt
@@ -0,0 +1,33 @@
+The following copyright holders agree that all of their contributions
+originally submitted to this project under the Mozilla Public License
+Version 2.0, are hereby relicensed to the Apache License, Version 2.0,
+and are submitted pursuant to the Developer Certificate of Origin, version 1.1:
+
+Ken Museth
+Mehdi Chinoune
+DreamWorks Animation
+Side Effects Software Inc.
+Blender Foundation
+NVIDIA Corporation
+United Therapeutics Corporation
+Digital Domain 3.0, Inc.
+Double Negative
+Ubisoft Entertainment SA
+Adobe Inc.
+Mathieu Malaterre
+Brecht Sanders
+Ignacio Vizzo
+Ben FrantzDale
+Sebastian Gaida
+Alessio Quaglino
+Benedikt Mersch
+David Aguilar
+Brian Sharpe
+Kartik Shrivastava
+Michael Lackner
+Lucas Baraya
+Kuba Roth
+Tom Cnops
+Walt Disney Pictures (Walt Disney Animation Studios)
+The Linux Foundation
+Industrial Light & Magic (ILM)
diff --git a/README.md b/README.md
index f16e03c451..32b95c3b20 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,8 @@ Mozilla Foundation. The trademarks of any contributor to this
project may not be used in association with the project without
the contributor's express permission.

+NOTE: OpenVDB is in the process of changing its license from [Mozilla Public License Version 2.0](https://www.mozilla.org/MPL/2.0/) to [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)! Please see the file RE-LICENSE_NOTE.txt for more details.
+
### Contributing

OpenVDB welcomes contributions to the OpenVDB project.
Please refer to the
diff --git a/ci/build.sh b/ci/build.sh
index 7e35797a14..d2e8e96603 100755
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -121,7 +121,15 @@ if HAS_PARM -v || HAS_PARM --verbose; then
  # support older versions of CMake.
  CMAKE_EXTRA+=("-DCMAKE_VERBOSE_MAKEFILE=ON")
fi
-if HAS_PARM --build-type; then CMAKE_EXTRA+=("-DCMAKE_BUILD_TYPE=${PARMS[--build-type]}"); fi
+if HAS_PARM --build-type; then
+    CMAKE_EXTRA+=("-DCMAKE_BUILD_TYPE=${PARMS[--build-type]}")
+    build_type="${PARMS[--build-type]}"
+    debug="Debug"
+    # Ignore case - if the build-type is Debug, we enable asserts
+    if [ "${build_type,,}" = "${debug,,}" ]; then
+        CMAKE_EXTRA+=("-DOPENVDB_ENABLE_ASSERTS=ON")
+    fi
+fi

# Available components. If a component is not provided it is
# explicitly set to OFF.
diff --git a/ci/download_houdini.py b/ci/download_houdini.py
old mode 100644
new mode 100755
index 7b65253d76..096e389133
--- a/ci/download_houdini.py
+++ b/ci/download_houdini.py
@@ -1,4 +1,4 @@
-#!/usr/local/bin/python
+#!/usr/bin/env python3
#
# Copyright Contributors to the OpenVDB Project
# SPDX-License-Identifier: MPL-2.0
@@ -19,16 +19,16 @@
import requests
import hashlib
import os
+import argparse
+import copy

-# this argument is for the major.minor version of Houdini to download (such as 15.0, 15.5, 16.0)
-version = sys.argv[1]
-only_production = True if sys.argv[2] == 'ON' else False
-user_client_id = os.getenv('HOUDINI_CLIENT_ID')
-user_client_secret_key = os.getenv('HOUDINI_SECRET_KEY')
-
-if not re.match('[0-9][0-9]\.[0-9]$', version):
-    raise IOError('Invalid Houdini Version "%s", expecting in the form "major.minor" such as "16.0"' % version)
-
+# For progress bar printing
+try:
+    from tqdm import tqdm
+    has_tqdm = True
+except ImportError:
+    has_tqdm = False

# Code that provides convenient Python wrappers to call into the API:
@@ -94,7 +94,7 @@ def get_access_token_and_expiry_time(
        ),
    })
    if response.status_code != 200:
-        raise AuthorizationError(response.status_code, reponse.text)
+        raise AuthorizationError(response.status_code, response.text)
    response_json = response.json()

    access_token_expiry_time = time.time() - 2 + response_json["expires_in"]
@@ -138,33 +138,83 @@ def __init__(self, http_code, message):
        self.http_code = http_code

-service = service(
-    access_token_url="https://www.sidefx.com/oauth2/application_token",
-    client_id=user_client_id,
-    client_secret_key=user_client_secret_key,
-    endpoint_url="https://www.sidefx.com/api/",
-    )
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Download a Houdini Installation')
+    parser.add_argument('version', type=str, help='Major.Minor version of Houdini to download')
+    parser.add_argument('platform', type=str, help='Platform target')
+    parser.add_argument('--prod', action='store_true', help='Only download production builds')
+    parser.add_argument('--list', action='store_true', help='Just list the available builds and exit.')
+    args = parser.parse_args()
+
+    version = args.version
+    platform = args.platform
+    only_production = args.prod
+
+    user_client_id = os.getenv('HOUDINI_CLIENT_ID')
+    user_client_secret_key = os.getenv('HOUDINI_SECRET_KEY')
+
+    if not re.match('[0-9][0-9]\.[0-9]$', version):
+        raise IOError('Invalid Houdini Version "%s", expecting in the form "major.minor" such as "16.0"' % version)
+
+    service = service(
+        access_token_url="https://www.sidefx.com/oauth2/application_token",
+        client_id=user_client_id,
+        client_secret_key=user_client_secret_key,
+        endpoint_url="https://www.sidefx.com/api/",
+        )

-releases_list =
service.download.get_daily_builds_list( - product='houdini', version=version, platform='linux', only_production=only_production) + releases_list = service.download.get_daily_builds_list( + product='houdini', version=version, platform=platform, only_production=only_production) -latest_release = service.download.get_daily_build_download( - product='houdini', version=version, platform='linux', build=releases_list[0]['build']) + print('Available builds:') + for rel in releases_list: + rel = copy.deepcopy(rel) + if 'third_party_libraries' in rel: + # Don't print these + del rel['third_party_libraries'] + print(rel) -# Download the file as hou.tar.gz -local_filename = 'hou.tar.gz' -response = requests.get(latest_release['download_url'], stream=True) -if response.status_code == 200: - with open(local_filename, 'wb') as f: + if args.list: + sys.exit(0) + + print('Selecting build: ' + releases_list[0]['build']) + + latest_release = service.download.get_daily_build_download( + product='houdini', version=version, platform=platform, build=releases_list[0]['build']) + print(latest_release) + + # Can't do this procedurally as latest_release['filename'] can contain + # multiple periods and may have multiple trailing extensions... + extension = '' + if 'linux' in platform: extension = 'tar.gz' + elif 'macos' in platform: extension = 'dmg' + elif 'win64' in platform: extension = 'exe' + assert(extension in latest_release['filename']) + + # Download the file and save it as hou.extension + local_filename = 'hou.' + extension + print('Writing to "' + local_filename + '"') + + response = requests.get(latest_release['download_url'], stream=True) + if response.status_code == 200: response.raw.decode_content = True - shutil.copyfileobj(response.raw, f) -else: - raise Exception('Error downloading file!') - -# Verify the file checksum is matching -file_hash = hashlib.md5() -with open(local_filename, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b''): - file_hash.update(chunk) -if file_hash.hexdigest() != latest_release['hash']: - raise Exception('Checksum does not match!') + if has_tqdm: + file_size = int(response.headers.get('Content-Length', 0)) + desc = "(Unknown total file size)" if file_size == 0 else "" + with tqdm.wrapattr(response.raw, "read", total=file_size, desc=desc) as r_raw: + with open(local_filename, 'wb') as f: + shutil.copyfileobj(r_raw, f) + else: + with open(local_filename, 'wb') as f: + shutil.copyfileobj(response.raw, f) + else: + raise Exception('Error downloading file!') + + # Verify the file checksum is matching + file_hash = hashlib.md5() + with open(local_filename, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b''): + file_hash.update(chunk) + if file_hash.hexdigest() != latest_release['hash']: + raise Exception('Checksum does not match!') diff --git a/ci/download_houdini.sh b/ci/download_houdini.sh index bbc91b6c9b..77aa8e3bd9 100755 --- a/ci/download_houdini.sh +++ b/ci/download_houdini.sh @@ -3,53 +3,120 @@ set -ex HOUDINI_MAJOR="$1" -GOLD="$2" +PLATFORM="$2" +OTHER_ARGS="$3" pip install --user requests +python ci/download_houdini.py $HOUDINI_MAJOR $PLATFORM $OTHER_ARGS -python ci/download_houdini.py $HOUDINI_MAJOR $GOLD - -# create dir hierarchy -mkdir -p hou/bin -mkdir -p hou/houdini -mkdir -p hou/toolkit -mkdir -p hou/dsolib - -# unpack hou.tar.gz and cleanup -tar -xzf hou.tar.gz -rm -rf hou.tar.gz -cd houdini* -tar -xzf houdini.tar.gz - -# copy required files into hou dir -cp houdini_setup* ../hou/. 
-# report library names
-ls -al dsolib/
-
-# copy required libraries
-cp -r toolkit/cmake ../hou/toolkit/.
-cp -r toolkit/include ../hou/toolkit/.
-cp -r dsolib/libHoudini* ../hou/dsolib/.
-cp -r dsolib/libopenvdb_sesi* ../hou/dsolib/.
-cp -r dsolib/libblosc* ../hou/dsolib/.
-cp -r dsolib/libhboost* ../hou/dsolib/.
-cp -r dsolib/libz* ../hou/dsolib/.
-cp -r dsolib/libbz2* ../hou/dsolib/.
-cp -r dsolib/libtbb* ../hou/dsolib/.
-cp -r dsolib/libjemalloc* ../hou/dsolib/.
-cp -r dsolib/liblzma* ../hou/dsolib/.
-cp -r dsolib/libIex* ../hou/dsolib/.
-cp -r dsolib/libImath* ../hou/dsolib/.
-cp -r dsolib/libIlmThread* ../hou/dsolib/.
-
-if [ "$HOUDINI_MAJOR" == "19.0" ]; then
-    cp -r dsolib/libHalf* ../hou/dsolib/.
-    cp -r dsolib/libIlmImf* ../hou/dsolib/.
+if [[ $PLATFORM =~ "linux" ]]; then
+    # create dir hierarchy
+    mkdir -p hou/bin
+    mkdir -p hou/houdini
+    mkdir -p hou/toolkit
+    mkdir -p hou/dsolib
+
+    # unpack hou.tar.gz and cleanup
+    tar -xzf hou.tar.gz
+    rm -rf hou.tar.gz
+    cd houdini*
+    tar -xzf houdini.tar.gz
+
+    # copy required files into hou dir
+    cp houdini_setup* ../hou/.
+
+    # report library names
+    ls -al dsolib/
+
+    # copy required libraries
+    cp -r toolkit/cmake ../hou/toolkit/.
+    cp -r toolkit/include ../hou/toolkit/.
+    cp -r dsolib/libHoudini* ../hou/dsolib/.
+    cp -r dsolib/libopenvdb_sesi* ../hou/dsolib/.
+    cp -r dsolib/libblosc* ../hou/dsolib/.
+    cp -r dsolib/libhboost* ../hou/dsolib/.
+    cp -r dsolib/libz* ../hou/dsolib/.
+    cp -r dsolib/libbz2* ../hou/dsolib/.
+    cp -r dsolib/libtbb* ../hou/dsolib/.
+    cp -r dsolib/libjemalloc* ../hou/dsolib/.
+    cp -r dsolib/liblzma* ../hou/dsolib/.
+    cp -r dsolib/libIex* ../hou/dsolib/.
+    cp -r dsolib/libImath* ../hou/dsolib/.
+    cp -r dsolib/libIlmThread* ../hou/dsolib/.
+    cd ..
+
+elif [[ $PLATFORM =~ "macos" ]]; then
+    # Extract files by mounting the downloaded dmg (we only really want to
+    # expand Houdini.framework)
+    hdiutil attach hou.dmg
+    pkgutil --expand-full /Volumes/Houdini/Houdini.pkg Houdini
+    hdiutil detach /Volumes/Houdini
+    rm hou.dmg
+
+    # Move the required Frameworks and delete the extracted src
+    mkdir -p hou/Frameworks
+    mv Houdini/Framework.pkg/Payload/Houdini.framework hou/Frameworks/Houdini.framework
+    rm -rf Houdini
+
+    # Report library names
+    ls -al hou/Frameworks/Houdini.framework/Libraries
+
+    # Remove unused resources
+    cd hou/Frameworks/Houdini.framework/Resources/
+    rm -rf $(ls | grep -e toolkit -v)
+    cd -
+
+    # Handle libraries. On some versions of MacOS with older versions of ld,
+    # ld will complain (error) if shared libraries contain missing files which
+    # are referenced with LC_LOAD_DYLIB or LC_RPATH entries (even though they
+    # are not explicitly required at link time). We still want to delete these
+    # unused libs as they occupy ~1-2GB. To handle this, we generate a unique
+    # list of libs that our direct dependencies reference and create an empty
+    # shared dylib in their place.
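+    # (Hedged sketch of the trick: `echo '' | clang -x c -shared -o stub.dylib -`
+    # emits a valid, essentially empty dylib, enough to satisfy an LC_LOAD_DYLIB
+    # reference without shipping the real multi-gigabyte libraries.)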
+ cd hou/Frameworks/Houdini.framework/Libraries + # Remove any folders here, they aren't needed + rm -rf $(ls -p | grep /) + # Remove any library that does not match the -e patterns (inverse grep with -v) + unused_libraries=$(ls | \ + grep -e libHoudini \ + -e libopenvdb_sesi \ + -e libblosc \ + -e libhboost \ + -e libz \ + -e libbz2 \ + -e libtbb \ + -e libjemalloc \ + -e liblzma \ + -e libIex \ + -e libImath \ + -e libIlmThread \ + -v) + rm -rf ${unused_libraries} + + # Create an empty valid shared lib + echo '' | clang -x c -shared -o libempty.dylib - + + # Generate a unique list of libs that our remaining libs reference + for i in $(ls); do otool -LX $i >> libnames; done + sort -u libnames | grep @rpath | cut -f1 -d' ' | xargs > rpaths + + # Recreate unused libraries that have been deleted as empty shared dylibs + # to keep ld happy + for libpath in $(cat rpaths); do + libpath=${libpath#"@rpath/"} + echo "Checking $libpath" + if [ ! -f $libpath ]; then + echo "Creating empty library at $libpath" + mkdir -p $(dirname $libpath) + cp libempty.dylib $libpath + fi + done + + rm libempty.dylib + cd - fi # write hou into hou.tar.gz and cleanup -cd .. tar -czvf hou.tar.gz hou # move hou.tar.gz into hou subdirectory diff --git a/ci/install_macos.sh b/ci/install_macos.sh index 20a2d6f4ad..0d7899687d 100755 --- a/ci/install_macos.sh +++ b/ci/install_macos.sh @@ -2,6 +2,13 @@ set -x +# Remove Python3 symlinks in /usr/local/bin as workaround to brew update issues +# https://github.com/actions/setup-python/issues/577 +rm /usr/local/bin/2to3* || : +rm /usr/local/bin/idle3* || : +rm /usr/local/bin/pydoc* || : +rm /usr/local/bin/python3* || : + brew update brew install bash gnu-getopt # for CI scripts brew install boost @@ -12,8 +19,8 @@ brew install googletest brew install jq # for trivial parsing of brew json brew install openexr brew install pybind11 # also installs the dependent python version -brew install tbb brew install zlib +brew install jemalloc # Alias python version installed by pybind11 to path py_version=$(brew info pybind11 --json | \ @@ -23,8 +30,13 @@ echo "Using python $py_version" echo "Python_ROOT_DIR=/usr/local/opt/$py_version" >> $GITHUB_ENV echo "/usr/local/opt/$py_version/bin" >> $GITHUB_PATH +# use bash +echo "/usr/local/opt/bash/bin" >> $GITHUB_PATH +echo "/opt/homebrew/opt/bash/bin" >> $GITHUB_PATH + # use gnu-getopt echo "/usr/local/opt/gnu-getopt/bin" >> $GITHUB_PATH +echo "/opt/homebrew/opt/gnu-getopt/bin" >> $GITHUB_PATH LLVM_VERSION=$1 if [ ! -z "$LLVM_VERSION" ]; then diff --git a/ci/install_tbb_macos.sh b/ci/install_tbb_macos.sh new file mode 100755 index 0000000000..4b71fe9e93 --- /dev/null +++ b/ci/install_tbb_macos.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +set -x + +brew update +brew install tbb diff --git a/ci/install_windows.sh b/ci/install_windows.sh index 77091a976a..9620ee42a7 100755 --- a/ci/install_windows.sh +++ b/ci/install_windows.sh @@ -1,8 +1,44 @@ #!/usr/bin/env bash -set -ex +set -x +set -e +# Required dependencies +VCPKG_INSTALL_CMD="vcpkg install + zlib + libpng + openexr + tbb + gtest + cppunit + blosc + glfw3 + glew + python3 + jemalloc + boost-iostreams + boost-interprocess + boost-algorithm + pybind11 + --clean-after-build" + +# Update vcpkg vcpkg update -vcpkg install zlib libpng openexr tbb gtest blosc glfw3 glew python3 \ - boost-iostreams boost-any boost-uuid boost-interprocess boost-algorithm pybind11 \ - --clean-after-build + +# Allow the vcpkg command to fail once so we can retry with the latest +set +e +$VCPKG_INSTALL_CMD +STATUS=$? 
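+# ($? expands to the exit status of the last command. With set +e in effect a
+# failed install reaches this line instead of aborting the script, and the
+# captured STATUS drives the retry logic below.)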
+
+# Subsequent commands cannot fail
+set -e
+
+if [ $STATUS -ne 0 ]; then
+    # Try once more with latest ports
+    echo "vcpkg install failed, retrying with latest ports..."
+    cd $VCPKG_INSTALLATION_ROOT && git pull && cd -
+    vcpkg update
+    $VCPKG_INSTALL_CMD
+fi
+
+echo "vcpkg install completed successfully"
diff --git a/ci/install_windows_cuda.ps1 b/ci/install_windows_cuda.ps1
index db8b49c79a..f365cca7e7 100644
--- a/ci/install_windows_cuda.ps1
+++ b/ci/install_windows_cuda.ps1
@@ -26,12 +26,13 @@ $CUDA_KNOWN_URLS = @{
     "11.2.1" = "https://developer.download.nvidia.com/compute/cuda/11.2.1/network_installers/cuda_11.2.1_win10_network.exe";
     "11.2.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
     "11.3.0" = "https://developer.download.nvidia.com/compute/cuda/11.3.0/network_installers/cuda_11.3.0_win10_network.exe";
-    "11.6.2" = "https://developer.download.nvidia.com/compute/cuda/11.6.2/network_installers/cuda_11.6.2_windows_network.exe"
+    "11.6.2" = "https://developer.download.nvidia.com/compute/cuda/11.6.2/network_installers/cuda_11.6.2_windows_network.exe";
+    "12.4.0" = "https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe"
 }
 
 # @todo - change this to be based on _MSC_VER instead, or invert it to be CUDA keyed instead?
 $VISUAL_STUDIO_MIN_CUDA = @{
-    "2022" = "11.6";
+    "2022" = "12.4";
     "2019" = "10.1";
     "2017" = "10.0"; # Depends on which version of 2017! 9.0 to 10.0 depending on version
     "2015" = "8.0"; # might support older, unsure.
diff --git a/cmake/FindBlosc.cmake b/cmake/FindBlosc.cmake
index a9c77ae5e7..9b873cfb63 100644
--- a/cmake/FindBlosc.cmake
+++ b/cmake/FindBlosc.cmake
@@ -188,22 +188,6 @@ list(APPEND _BLOSC_LIBRARYDIR_SEARCH_DIRS
   ${SYSTEM_LIBRARY_PATHS}
 )
 
-# Library suffix handling
-
-set(_BLOSC_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
-set(_BLOSC_ORIG_CMAKE_FIND_LIBRARY_PREFIXES ${CMAKE_FIND_LIBRARY_PREFIXES})
-
-if(MSVC)
-  if(BLOSC_USE_STATIC_LIBS)
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".lib")
-    set(CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES};lib")
-  endif()
-else()
-  if(BLOSC_USE_STATIC_LIBS)
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
-  endif()
-endif()
-
 set(Blosc_LIB_COMPONENTS "")
 # NOTE: Search for debug version first (see vcpkg hack)
 list(APPEND BLOSC_BUILD_TYPES DEBUG RELEASE)
@@ -246,13 +230,6 @@ foreach(BUILD_TYPE ${BLOSC_BUILD_TYPES})
   set(CMAKE_IGNORE_PATH ${_BLOSC_CMAKE_IGNORE_PATH})
 endforeach()
 
-# Reset library suffix
-
-set(CMAKE_FIND_LIBRARY_SUFFIXES ${_BLOSC_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
-set(CMAKE_FIND_LIBRARY_PREFIXES ${_BLOSC_ORIG_CMAKE_FIND_LIBRARY_PREFIXES})
-unset(_BLOSC_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
-unset(_BLOSC_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
-
 if(Blosc_LIBRARY_DEBUG AND Blosc_LIBRARY_RELEASE)
   # if the generator is multi-config or if CMAKE_BUILD_TYPE is set for
   # single-config generators, set optimized and debug libraries
diff --git a/cmake/FindTBB.cmake b/cmake/FindTBB.cmake
index 8588aa0482..7c85bb7d9a 100644
--- a/cmake/FindTBB.cmake
+++ b/cmake/FindTBB.cmake
@@ -461,7 +461,9 @@ foreach(COMPONENT ${TBB_FIND_COMPONENTS})
       IMPORTED_CONFIGURATIONS RELEASE)
     set_target_properties(TBB::${COMPONENT} PROPERTIES
       IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX"
-      IMPORTED_LOCATION_RELEASE "${Tbb_${COMPONENT}_LIBRARY_RELEASE}")
+      IMPORTED_LOCATION_RELEASE "${Tbb_${COMPONENT}_LIBRARY_RELEASE}"
+      MAP_IMPORTED_CONFIG_MINSIZEREL Release
+      MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release)
   endif()
 
   # Debug location
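The MAP_IMPORTED_CONFIG properties added above matter because the imported TBB targets only carry an IMPORTED_LOCATION for the Release (and, where found, Debug) configuration; without the mapping, a MinSizeRel or RelWithDebInfo consumer resolves TBB::tbb to no location at link time. A minimal sketch of the consumer side (hypothetical project and target names; assumes this FindTBB module is on CMAKE_MODULE_PATH):

```cmake
# Hypothetical consumer project; "myapp" and main.cc are placeholders.
cmake_minimum_required(VERSION 3.18)
project(TbbConsumer CXX)

# Resolves the TBB::tbb imported target via the FindTBB.cmake patched above.
find_package(TBB REQUIRED COMPONENTS tbb)

add_executable(myapp main.cc)
target_link_libraries(myapp PRIVATE TBB::tbb)

# With MAP_IMPORTED_CONFIG_RELWITHDEBINFO mapped to Release, configuring with
#   cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo
# links the Release TBB library instead of failing to resolve a location
# for a configuration that was never populated.
```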
diff --git a/cmake/OpenVDBHoudiniSetup.cmake b/cmake/OpenVDBHoudiniSetup.cmake
index 2de38b53f6..88325c931f 100644
--- a/cmake/OpenVDBHoudiniSetup.cmake
+++ b/cmake/OpenVDBHoudiniSetup.cmake
@@ -288,7 +288,11 @@ endif()
 
 # Jemalloc
 
-if(NOT JEMALLOC_LIBRARYDIR)
+# * On Mac OSX, linking against Jemalloc < 4.3.0 seg-faults with this error:
+#     malloc: *** malloc_zone_unregister() failed for 0xaddress
+#   As of Houdini 20, it still ships with Jemalloc 3.6.0, so don't expose it
+#   on Mac OSX (https://github.com/jemalloc/jemalloc/issues/420).
+if(NOT APPLE AND NOT JEMALLOC_LIBRARYDIR)
   set(JEMALLOC_LIBRARYDIR ${HOUDINI_LIB_DIR})
 endif()
 
@@ -335,68 +339,67 @@ if(NOT OPENVDB_HOUDINI_ABI)
 endif()
 
 # ------------------------------------------------------------------------
-# Configure GCC CXX11 ABI
+# Configure libstdc++ CXX11 ABI
 # ------------------------------------------------------------------------
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  if((CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 5.1) OR
-     (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1))
-    message(STATUS "GCC >= 5.1 detected. Configuring GCC CXX11 ABI for Houdini compatibility...")
-
-    execute_process(COMMAND echo "#include <string>"
-      COMMAND ${CMAKE_CXX_COMPILER} "-x" "c++" "-E" "-dM" "-"
-      COMMAND grep "-F" "_GLIBCXX_USE_CXX11_ABI"
-      TIMEOUT 10
-      RESULT_VARIABLE QUERIED_GCC_CXX11_ABI_SUCCESS
-      OUTPUT_VARIABLE _GCC_CXX11_ABI)
-
-    set(GLIBCXX_USE_CXX11_ABI "UNKNOWN")
-
-    if(NOT QUERIED_GCC_CXX11_ABI_SUCCESS)
-      string(FIND ${_GCC_CXX11_ABI} "_GLIBCXX_USE_CXX11_ABI 0" GCC_OLD_CXX11_ABI)
-      string(FIND ${_GCC_CXX11_ABI} "_GLIBCXX_USE_CXX11_ABI 1" GCC_NEW_CXX11_ABI)
-      if(NOT (${GCC_OLD_CXX11_ABI} EQUAL -1))
-        set(GLIBCXX_USE_CXX11_ABI 0)
-      endif()
-      if(NOT (${GCC_NEW_CXX11_ABI} EQUAL -1))
-        set(GLIBCXX_USE_CXX11_ABI 1)
-      endif()
-    endif()
+if(UNIX AND NOT APPLE)
+  # Assume we're using libstdc++
+  message(STATUS "Configuring CXX11 ABI for Houdini compatibility...")
+
+  execute_process(COMMAND echo "#include <string>"
+    COMMAND ${CMAKE_CXX_COMPILER} "-x" "c++" "-E" "-dM" "-"
+    COMMAND grep "-F" "_GLIBCXX_USE_CXX11_ABI"
+    TIMEOUT 10
+    RESULT_VARIABLE QUERIED_GCC_CXX11_ABI_SUCCESS
+    OUTPUT_VARIABLE _GCC_CXX11_ABI)
+
+  set(GLIBCXX_USE_CXX11_ABI "UNKNOWN")
 
-    # Try and query the Houdini CXX11 ABI. Allow it to be provided by users to
-    # override this logic should Houdini's CMake ever change
-
-    if(NOT DEFINED HOUDINI_CXX11_ABI)
-      get_target_property(houdini_interface_compile_options
-        Houdini INTERFACE_COMPILE_OPTIONS)
-      set(HOUDINI_CXX11_ABI "UNKNOWN")
-      if("-D_GLIBCXX_USE_CXX11_ABI=0" IN_LIST houdini_interface_compile_options)
-        set(HOUDINI_CXX11_ABI 0)
-      elseif("-D_GLIBCXX_USE_CXX11_ABI=1" IN_LIST houdini_interface_compile_options)
-        set(HOUDINI_CXX11_ABI 1)
-      endif()
    endif()
+  if(NOT QUERIED_GCC_CXX11_ABI_SUCCESS)
+    string(FIND ${_GCC_CXX11_ABI} "_GLIBCXX_USE_CXX11_ABI 0" GCC_OLD_CXX11_ABI)
+    string(FIND ${_GCC_CXX11_ABI} "_GLIBCXX_USE_CXX11_ABI 1" GCC_NEW_CXX11_ABI)
+    if(NOT (${GCC_OLD_CXX11_ABI} EQUAL -1))
+      set(GLIBCXX_USE_CXX11_ABI 0)
    endif()
+    if(NOT (${GCC_NEW_CXX11_ABI} EQUAL -1))
+      set(GLIBCXX_USE_CXX11_ABI 1)
+    endif()
+  endif()
 
-    message(STATUS "  GCC CXX11 ABI     : ${GLIBCXX_USE_CXX11_ABI}")
-    message(STATUS "  Houdini CXX11 ABI : ${HOUDINI_CXX11_ABI}")
+  # Try and query the Houdini CXX11 ABI. Allow it to be provided by users to
+  # override this logic should Houdini's CMake ever change
 
-    if(${HOUDINI_CXX11_ABI} STREQUAL "UNKNOWN")
-      message(WARNING "Unable to determine Houdini CXX11 ABI. 
Assuming newer ABI "
-        "has been used.")
+  if(NOT DEFINED HOUDINI_CXX11_ABI)
+    get_target_property(houdini_interface_compile_options
+      Houdini INTERFACE_COMPILE_OPTIONS)
+    set(HOUDINI_CXX11_ABI "UNKNOWN")
+    if("-D_GLIBCXX_USE_CXX11_ABI=0" IN_LIST houdini_interface_compile_options)
+      set(HOUDINI_CXX11_ABI 0)
+    elseif("-D_GLIBCXX_USE_CXX11_ABI=1" IN_LIST houdini_interface_compile_options)
      set(HOUDINI_CXX11_ABI 1)
    endif()
+  endif()
 
-    if(${GLIBCXX_USE_CXX11_ABI} EQUAL ${HOUDINI_CXX11_ABI})
-      message(STATUS "  Current CXX11 ABI matches Houdini configuration "
-        "(_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI}).")
-    else()
-      message(WARNING "A potential mismatch was detected between the CXX11 ABI "
-        "of GCC and Houdini. The following ABI configuration will be used: "
-        "-D_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI}. See: "
-        "https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html and "
-        "https://vfxplatform.com/#footnote-gcc6 for more information.")
-    endif()
+  message(STATUS "  GNU CXX11 ABI     : ${GLIBCXX_USE_CXX11_ABI}")
+  message(STATUS "  Houdini CXX11 ABI : ${HOUDINI_CXX11_ABI}")
+
+  if(${HOUDINI_CXX11_ABI} STREQUAL "UNKNOWN")
+    message(WARNING "Unable to determine Houdini CXX11 ABI. Assuming newer ABI "
+      "has been used.")
+    set(HOUDINI_CXX11_ABI 1)
+  endif()
 
-    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI})
+  if(${GLIBCXX_USE_CXX11_ABI} EQUAL ${HOUDINI_CXX11_ABI})
+    message(STATUS "  Current CXX11 ABI matches Houdini configuration "
+      "(_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI}).")
+  else()
+    message(WARNING "A potential mismatch was detected between the CXX11 ABI "
+      "of libstdc++ and Houdini. The following ABI configuration will be used: "
+      "-D_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI}. See: "
+      "https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html and "
+      "https://vfxplatform.com/#footnote-gcc6 for more information.")
  endif()
+
+  add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${HOUDINI_CXX11_ABI})
 endif()
+
diff --git a/cmake/config/OpenVDBCXX.cmake b/cmake/config/OpenVDBCXX.cmake
index 5ad5d18725..36490afe5a 100644
--- a/cmake/config/OpenVDBCXX.cmake
+++ b/cmake/config/OpenVDBCXX.cmake
@@ -245,7 +245,7 @@ if(CMAKE_BUILD_TYPE EQUAL coverage)
 endif()
 
 # Note that the thread, address and memory sanitizers are incompatible with each other
-set(EXTRA_BUILD_TYPES coverage tsan asan lsan msan ubsan)
+set(EXTRA_BUILD_TYPES coverage tsan asan lsan msan ubsan abicheck)
 
 # Set all build flags to empty (unless they have been provided)
 
@@ -304,12 +304,22 @@ add_link_options("$<$,$,$>:-fsanitize=undefined>")
 add_link_options("$<$,$>:-fsanitize=undefined>")
 
+# ABI Check. This build type is expected to work with the abi-dumper/abi-compliance-checker
+# binaries which expect specific debug information. In particular, for GCC versions >= 11
+# we have to explicitly select dwarf versions < 5 as the abi-dumper doesn't support dwarf5
+# and will always incorrectly report successful ABI checks
+# https://github.com/lvc/abi-dumper/issues/33
+add_compile_options("$<$<CONFIG:abicheck>:-gdwarf-4;-g3;-ggdb;-Og>")
+
 # CMAKE_BUILD_TYPE is ignored for multi config generators i.e.
MSVS
 get_property(_isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 if(NOT _isMultiConfig)
   message(STATUS "CMake Build Type: ${CMAKE_BUILD_TYPE}")
 endif()
+if(OPENVDB_ENABLE_ASSERTS)
+  message(STATUS "OpenVDB asserts are ENABLED")
+endif()
 
 # Initialize extra build type targets where possible
diff --git a/cmake/scripts/lsan.supp b/cmake/scripts/lsan.supp
new file mode 100644
index 0000000000..d6b373aedf
--- /dev/null
+++ b/cmake/scripts/lsan.supp
@@ -0,0 +1,15 @@
+#################################################################################
+## This file is loaded by the Leak/Address Sanitizer build for the unit tests. ##
+## It can be used to ignore various errors reported by the sanitizer. This is  ##
+## especially useful with upstream issues (e.g. boost/tbb). For help defining  ##
+## suppression rules, see:                                                     ##
+## https://clang.llvm.org/docs/AddressSanitizer.html                           ##
+## The build is configured with CMAKE_BUILD_TYPE=asan or lsan                  ##
+#################################################################################
+
+##### Upstream #####
+
+# Leaks from TBB init which occur due to tbb teardown issues
+# https://github.com/oneapi-src/oneTBB/issues/206
+# Should be fixed in oneTBB. Ignore them for now
+leak:tbb::internal::task_stream<3>::initialize*
diff --git a/doc/nanovdb/SourceTree.md b/doc/nanovdb/SourceTree.md
index 6eb0a1dcc9..49bb8dc4b7 100644
--- a/doc/nanovdb/SourceTree.md
+++ b/doc/nanovdb/SourceTree.md
@@ -5,12 +5,12 @@
 * [NanoVDB.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/NanoVDB.h) C++11 implementation of the core data structure and its access methods.
 * [CNanoVDB.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/CNanoVDB.h) Incomplete but usable C99 implementation of the core data structure and its access methods. Designed in particular for use in OpenCL kernels. Note that this relies on zero-sized arrays for the _reserved padding, so will not work on all compilers (with MSVC being a particular example)
 * [PNanoVDB.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/PNanoVDB.h) C99 implementation of the core data structure and its access methods. More complete coverage than CNanoVDB. This version is pointer-less and supports virtually all graphics APIs.
-* [util/GridHandle.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/GridHandle.h) defines a handler for the memory allocated to a NanoVDB grid.
-* [util/IO.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/IO.h) implements I/O support.
-* [util/OpenToNanoVDB.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/OpenToNanoVDB.h) defines the converter from OpenVDB to NanoVDB and obviously depends on the OpenVDB library (as the only header file).
-* [Ray.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/Ray.h) Ray class.
-* [HDDA.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/HDDA.h) HDDA related.
-* [SampleFromVoxels.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/util/SampleFromVoxels.h) interpolation.
+* [GridHandle.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/GridHandle.h) defines a handler for the memory allocated to a NanoVDB grid.
+* [io/IO.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/io/IO.h) implements I/O support. +* [tools/CreateNanoGrid.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/tools/CreateNanoGrid.h) defines the converter from OpenVDB to NanoVDB and obviously depends on the OpenVDB library (as the only header file). +* [math/Ray.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/math/Ray.h) Ray class. +* [math/HDDA.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/math/HDDA.h) HDDA related. +* [math/SampleFromVoxels.h](https://github.com/AcademySoftwareFoundation/openvdb/blob/master/nanovdb/nanovdb/math/SampleFromVoxels.h) interpolation. ```bash foo@bar:~$ tree @@ -22,25 +22,19 @@ foo@bar:~$ tree │ │ └── nanovdb_convert.cc │ ├── print │ │ └── nanovdb_print.cc +│ ├── updateFiles.sh │ └── validate │ └── nanovdb_validate.cc ├── CNanoVDB.h +├── cuda +│ ├── DeviceBuffer.h +│ ├── GridHandle.cuh +│ └── NodeManager.cuh ├── docs │ ├── CMakeLists.txt │ ├── codingstyle.txt │ └── doxygen-config ├── examples -│ ├── benchmark -│ │ ├── BenchKernels_dense.cu -│ │ ├── BenchKernels_nano.cu -│ │ ├── Benchmark_dense.cu -│ │ ├── Benchmark_nano.cu -│ │ ├── Camera.h -│ │ ├── CMakeLists.txt -│ │ ├── DenseGrid.h -│ │ ├── Image.h -│ │ ├── TestBenchmark.cc -│ │ └── TestBenchmark.cu │ ├── CMakeLists.txt │ ├── ex_bump_pool_buffer │ │ └── bump_pool_buffer.cc @@ -50,7 +44,7 @@ foo@bar:~$ tree │ │ ├── nanovdb.cu │ │ └── openvdb.cc │ ├── ex_index_grid_cuda -│ │ ├── index_grid_cuda.cu +│ │ ├── index_grid_cuda.cc │ │ └── index_grid_cuda_kernel.cu │ ├── ex_make_custom_nanovdb │ │ └── make_custom_nanovdb.cc @@ -66,6 +60,7 @@ foo@bar:~$ tree │ ├── ex_map_pool_buffer │ │ └── map_pool_buffer.cc │ ├── ex_modify_nanovdb_thrust +│ │ ├── modify_nanovdb_thrust.cc │ │ └── modify_nanovdb_thrust.cu │ ├── ex_nodemanager_cuda │ │ ├── nodemanager_cuda.cc @@ -103,9 +98,38 @@ foo@bar:~$ tree │ │ └── VoxToNanoVDB.h │ └── ex_write_nanovdb_grids │ └── write_nanovdb_grids.cc +├── GridHandle.h +├── HostBuffer.h +├── io +│ └── IO.h +├── math +│ ├── CSampleFromVoxels.h +│ ├── DitherLUT.h +│ ├── HDDA.h +│ ├── Math.h +│ ├── Ray.h +│ ├── SampleFromVoxels.h +│ └── Stencils.h ├── NanoVDB.h +├── NodeManager.h ├── PNanoVDB.h ├── Readme.md +├── tools +│ ├── CreateNanoGrid.h +│ ├── CreatePrimitives.h +│ ├── cuda +│ │ ├── AddBlindData.cuh +│ │ ├── GridChecksum.cuh +│ │ ├── GridStats.cuh +│ │ ├── GridValidator.cuh +│ │ ├── IndexToGrid.cuh +│ │ ├── PointsToGrid.cuh +│ │ └── SignedFloodFill.cuh +│ ├── GridBuilder.h +│ ├── GridChecksum.h +│ ├── GridStats.h +│ ├── GridValidator.h +│ └── NanoToOpenVDB.h ├── unittest │ ├── CMakeLists.txt │ ├── pnanovdb_validate_strides.h @@ -115,21 +139,25 @@ foo@bar:~$ tree └── util ├── CpuTimer.h ├── CreateNanoGrid.h - ├── CSampleFromVoxels.h ├── cuda │ ├── CudaAddBlindData.cuh │ ├── CudaDeviceBuffer.h + │ ├── CudaGridChecksum.cuh │ ├── CudaGridHandle.cuh + │ ├── CudaGridStats.cuh + │ ├── CudaGridValidator.cuh │ ├── CudaIndexToGrid.cuh + │ ├── CudaNodeManager.cuh │ ├── CudaPointsToGrid.cuh │ ├── CudaSignedFloodFill.cuh │ ├── CudaUtils.h - │ └── GpuTimer.cuh + │ ├── GpuTimer.h + │ ├── Timer.h + │ └── Util.h ├── DitherLUT.h ├── ForEach.h ├── GridBuilder.h ├── GridChecksum.h - ├── GridHandle.h ├── GridStats.h ├── GridValidator.h ├── HDDA.h @@ -145,5 +173,6 @@ foo@bar:~$ tree ├── Ray.h ├── Reduce.h ├── SampleFromVoxels.h - └── Stencils.h -``` + ├── Stencils.h + ├── Timer.h + └── Util.h \ No newline at end of 
file diff --git a/nanovdb/nanovdb/CMakeLists.txt b/nanovdb/nanovdb/CMakeLists.txt index 7bb3ab862d..5ef70a9fc1 100644 --- a/nanovdb/nanovdb/CMakeLists.txt +++ b/nanovdb/nanovdb/CMakeLists.txt @@ -160,31 +160,66 @@ endif() # NanoVDB header files set(NANOVDB_INCLUDE_FILES CNanoVDB.h + GridHandle.h + HostBuffer.h NanoVDB.h + NodeManager.h PNanoVDB.h ) +# NanoVDB cuda header files +set(NANOVDB_INCLUDE_CUDA_FILES + cuda/DeviceBuffer.h + cuda/GridHandle.cuh + cuda/NodeManager.cuh +) + +# NanoVDB io header files +set(NANOVDB_INCLUDE_IO_FILES + io/IO.h +) + +# NanoVDB math header files +set(NANOVDB_INCLUDE_MATH_FILES + math/CSampleFromVoxels.h + math/DitherLUT.h + math/HDDA.h + math/Math.h + math/Ray.h + math/SampleFromVoxels.h + math/Stencils.h +) + +# NanoVDB tools header files +set(NANOVDB_INCLUDE_TOOLS_FILES + tools/CreateNanoGrid.h + tools/CreatePrimitives.h + tools/GridBuilder.h + tools/GridChecksum.h + tools/GridStats.h + tools/GridValidator.h + tools/NanoToOpenVDB.h +) + +# NanoVDB tools/cuda header files +set(NANOVDB_INCLUDE_TOOLS_CUDA_FILES + tools/cuda/AddBlindData.cuh + tools/cuda/GridChecksum.cuh + tools/cuda/GridStats.cuh + tools/cuda/GridValidator.cuh + tools/cuda/IndexToGrid.cuh + tools/cuda/PointsToGrid.cuh + tools/cuda/SignedFloodFill.cuh +) + # NanoVDB util header files -set(NANOVDB_INCLUDE_UTILFILES +set(NANOVDB_INCLUDE_UTIL_FILES util/CpuTimer.h util/CreateNanoGrid.h - util/CSampleFromVoxels.h - util/cuda/CudaAddBlindData.cuh - util/cuda/CudaDeviceBuffer.h - util/cuda/CudaGridChecksum.cuh - util/cuda/CudaGridHandle.cuh - util/cuda/CudaGridStats.cuh - util/cuda/CudaIndexToGrid.cuh - util/cuda/CudaNodeManager.cuh - util/cuda/CudaPointsToGrid.cuh - util/cuda/CudaSignedFloodFill.cuh - util/cuda/CudaUtils.h - util/cuda/GpuTimer.h util/DitherLUT.h util/ForEach.h util/GridBuilder.h util/GridChecksum.h - util/GridHandle.h util/GridStats.h util/GridValidator.h util/HDDA.h @@ -201,6 +236,26 @@ set(NANOVDB_INCLUDE_UTILFILES util/Reduce.h util/SampleFromVoxels.h util/Stencils.h + util/Timer.h + util/Util.h +) + +# NanoVDB util/cuda header files +set(NANOVDB_INCLUDE_UTIL_CUDA_FILES + util/cuda/CudaAddBlindData.cuh + util/cuda/CudaGridHandle.cuh + util/cuda/CudaIndexToGrid.cuh + util/cuda/CudaSignedFloodFill.cuh + util/cuda/Timer.h + util/cuda/CudaDeviceBuffer.h + util/cuda/CudaGridStats.cuh + util/cuda/CudaNodeManager.cuh + util/cuda/CudaUtils.h + util/cuda/Util.h + util/cuda/CudaGridChecksum.cuh + util/cuda/CudaGridValidator.cuh + util/cuda/CudaPointsToGrid.cuh + util/cuda/GpuTimer.h ) add_library(nanovdb INTERFACE) @@ -266,11 +321,23 @@ if(TARGET Threads::Threads) target_link_libraries(nanovdb INTERFACE Threads::Threads) endif() -set(NANOVDB_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/nanovdb) -set(NANOVDB_INSTALL_UTILDIR ${NANOVDB_INSTALL_INCLUDEDIR}/util) - -install(FILES ${NANOVDB_INCLUDE_FILES} DESTINATION ${NANOVDB_INSTALL_INCLUDEDIR}) -install(FILES ${NANOVDB_INCLUDE_UTILFILES} DESTINATION ${NANOVDB_INSTALL_UTILDIR}) +set(NANOVDB_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/nanovdb) +set(NANOVDB_INSTALL_CUDA_DIR ${NANOVDB_INSTALL_INCLUDE_DIR}/cuda) +set(NANOVDB_INSTALL_IO_DIR ${NANOVDB_INSTALL_INCLUDE_DIR}/io) +set(NANOVDB_INSTALL_MATH_DIR ${NANOVDB_INSTALL_INCLUDE_DIR}/math) +set(NANOVDB_INSTALL_TOOLS_DIR ${NANOVDB_INSTALL_INCLUDE_DIR}/tools) +set(NANOVDB_INSTALL_TOOLS_CUDA_DIR ${NANOVDB_INSTALL_TOOLS_DIR}/cuda) +set(NANOVDB_INSTALL_UTIL_DIR ${NANOVDB_INSTALL_INCLUDE_DIR}/util) +set(NANOVDB_INSTALL_UTIL_CUDA_DIR ${NANOVDB_INSTALL_UTIL_DIR}/cuda) + +install(FILES 
${NANOVDB_INCLUDE_FILES} DESTINATION ${NANOVDB_INSTALL_INCLUDE_DIR})
+install(FILES ${NANOVDB_INCLUDE_CUDA_FILES} DESTINATION ${NANOVDB_INSTALL_CUDA_DIR})
+install(FILES ${NANOVDB_INCLUDE_IO_FILES} DESTINATION ${NANOVDB_INSTALL_IO_DIR})
+install(FILES ${NANOVDB_INCLUDE_MATH_FILES} DESTINATION ${NANOVDB_INSTALL_MATH_DIR})
+install(FILES ${NANOVDB_INCLUDE_TOOLS_FILES} DESTINATION ${NANOVDB_INSTALL_TOOLS_DIR})
+install(FILES ${NANOVDB_INCLUDE_TOOLS_CUDA_FILES} DESTINATION ${NANOVDB_INSTALL_TOOLS_CUDA_DIR})
+install(FILES ${NANOVDB_INCLUDE_UTIL_FILES} DESTINATION ${NANOVDB_INSTALL_UTIL_DIR})
+install(FILES ${NANOVDB_INCLUDE_UTIL_CUDA_FILES} DESTINATION ${NANOVDB_INSTALL_UTIL_CUDA_DIR})
 
 ###############################################################################
 # Options
diff --git a/nanovdb/nanovdb/util/GridHandle.h b/nanovdb/nanovdb/GridHandle.h
similarity index 89%
rename from nanovdb/nanovdb/util/GridHandle.h
rename to nanovdb/nanovdb/GridHandle.h
index 14094fbe69..a3d868e8be 100644
--- a/nanovdb/nanovdb/util/GridHandle.h
+++ b/nanovdb/nanovdb/GridHandle.h
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: MPL-2.0
 
 /*!
-    \file GridHandle.h
+    \file nanovdb/GridHandle.h
 
     \author Ken Museth
 
@@ -20,9 +20,9 @@
 #include <fstream>
 #include <vector>
 
-#include <nanovdb/NanoVDB.h>// for mapToGridType
-#include <nanovdb/util/HostBuffer.h>
-#include <nanovdb/util/GridChecksum.h>// for updateGridCount
+#include <nanovdb/NanoVDB.h>// for toGridType
+#include <nanovdb/HostBuffer.h>
+#include <nanovdb/tools/GridChecksum.h>// for updateGridCount
 
 namespace nanovdb {
 
@@ -48,13 +48,13 @@ class GridHandle
     /// @brief Move constructor from a host buffer
     /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle
     /// @throw Will throw an error if the buffer does not contain a valid NanoGrid!
-    template<typename T = BufferT, typename disable_if<BufferTraits<T>::hasDeviceDual, int>::type = 0>
+    template<typename T = BufferT, typename util::disable_if<BufferTraits<T>::hasDeviceDual, int>::type = 0>
     GridHandle(T&& buffer);
 
     /// @brief Move constructor from a dual host-device buffer
     /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle
     /// @throw Will throw an error if the buffer does not contain a valid NanoGrid!
-    template<typename T = BufferT, typename enable_if<BufferTraits<T>::hasDeviceDual, int>::type = 0>
+    template<typename T = BufferT, typename util::enable_if<BufferTraits<T>::hasDeviceDual, int>::type = 0>
     GridHandle(T&& buffer);
 
     /// @brief Constructs an empty GridHandle
@@ -100,17 +100,17 @@ class GridHandle
 
     /// @brief Returns a non-const pointer to the data.
     /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized
-    uint8_t* data() { return mBuffer.data(); }
+    void* data() { return mBuffer.data(); }
 
     /// @brief Returns a const pointer to the data.
     /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized
-    const uint8_t* data() const { return mBuffer.data(); }
+    const void* data() const { return mBuffer.data(); }
 
     template<typename U = BufferT>
-    typename enable_if<BufferTraits<U>::hasDeviceDual, const uint8_t*>::type
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, const void*>::type
     deviceData() const { return mBuffer.deviceData(); }
     template<typename U = BufferT>
-    typename enable_if<BufferTraits<U>::hasDeviceDual, uint8_t*>::type
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, void*>::type
     deviceData() { return mBuffer.deviceData(); }
 
     /// @brief Returns the size in bytes of the raw memory buffer managed by this GridHandle.
@@ -147,7 +147,7 @@ class GridHandle
     /// @warning Note that the return pointer can be NULL if the GridHandle has no device grid, @a n is invalid,
     ///          or if the template parameter does not match the specified grid.
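     /// (Illustrative usage sketch, assuming a device-capable buffer type such
     /// as the cuda::DeviceBuffer added elsewhere in this change:)
     /// @code
     ///     auto* d_grid = handle.deviceGrid<float>();// NULL on type mismatch
     /// @endcode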
template<typename ValueT>
-    typename enable_if<BufferTraits<BufferT>::hasDeviceDual, const NanoGrid<ValueT>*>::type
+    typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, const NanoGrid<ValueT>*>::type
     deviceGrid(uint32_t n=0) const;
 
     /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU
@@ -157,19 +157,19 @@ class GridHandle
     /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized, @a n is invalid,
     ///          or if the template parameter does not match the specified grid.
     template<typename ValueT>
-    typename enable_if<BufferTraits<BufferT>::hasDeviceDual, NanoGrid<ValueT>*>::type
+    typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, NanoGrid<ValueT>*>::type
     deviceGrid(uint32_t n=0){return const_cast<NanoGrid<ValueT>*>(static_cast<const GridHandle*>(this)->template deviceGrid<ValueT>(n));}
 
     /// @brief Upload the grid to the device, e.g. from CPU to GPU
     /// @note This method is only available if the buffer supports devices
     template<typename U = BufferT>
-    typename enable_if<BufferTraits<U>::hasDeviceDual, void>::type
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type
     deviceUpload(void* stream = nullptr, bool sync = true) { mBuffer.deviceUpload(stream, sync); }
 
     /// @brief Download the grid from the device, e.g. from GPU to CPU
     /// @note This method is only available if the buffer supports devices
     template<typename U = BufferT>
-    typename enable_if<BufferTraits<U>::hasDeviceDual, void>::type
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type
     deviceDownload(void* stream = nullptr, bool sync = true) { mBuffer.deviceDownload(stream, sync); }
 
     /// @brief Check if the buffer in this handle has any padding, i.e. if the buffer is larger than the combined size of all its grids
@@ -292,41 +292,39 @@ class GridHandle
 
 template<typename BufferT>
 inline const GridData* GridHandle<BufferT>::gridData(uint32_t n) const
 {
-    const uint8_t *data = this->data();
+    const void *data = this->data();
     if (data == nullptr || n >= mMetaData.size()) return nullptr;
-    return reinterpret_cast<const GridData*>(data + mMetaData[n].offset);
+    return util::PtrAdd<GridData>(data, mMetaData[n].offset);
 }// const GridData* GridHandle<BufferT>::gridData(uint32_t n) const
 
 template<typename BufferT>
 inline const GridMetaData* GridHandle<BufferT>::gridMetaData(uint32_t n) const
 {
-    const uint8_t *data = this->data();
+    const auto *data = this->data();
     if (data == nullptr || n >= mMetaData.size()) return nullptr;
-    return reinterpret_cast<const GridMetaData*>(data + mMetaData[n].offset);
+    return util::PtrAdd<GridMetaData>(data, mMetaData[n].offset);
 }// const GridMetaData* GridHandle<BufferT>::gridMetaData(uint32_t n) const
 
-namespace {// anonymous namespace
-inline __hostdev__ void cpyMetaData(const GridData *data, GridHandleMetaData *meta)
+inline __hostdev__ void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta)
 {
     uint64_t offset = 0;
     for (auto *p=meta, *q=p+data->mGridCount; p!=q; ++p) {
         *p = {offset, data->mGridSize, data->mGridType};
         offset += p->size;
-        data = PtrAdd<GridData>(data, p->size);
+        data = util::PtrAdd<GridData>(data, p->size);
     }
-}// void cpyMetaData(const GridData *data, GridHandleMetaData *meta)
-}// anonymous namespace
+}// void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta)
 
 template<typename BufferT>
-template<typename T, typename disable_if<BufferTraits<T>::hasDeviceDual, int>::type>
+template<typename T, typename util::disable_if<BufferTraits<T>::hasDeviceDual, int>::type>
 GridHandle<BufferT>::GridHandle(T&& buffer)
 {
-    static_assert(is_same<T, BufferT>::value, "Expected U==BufferT");
+    static_assert(util::is_same<T, BufferT>::value, "Expected U==BufferT");
     mBuffer = std::move(buffer);
     if (auto *data = reinterpret_cast<const GridData*>(mBuffer.data())) {
         if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer");
         mMetaData.resize(data->mGridCount);
-        cpyMetaData(data, mMetaData.data());
+        cpyGridHandleMeta(data, mMetaData.data());
     }
 }// GridHandle<BufferT>::GridHandle(T&& buffer)
 
@@ -344,19 +342,19 @@ template<typename BufferT>
template<typename ValueT>
 inline const NanoGrid<ValueT>* GridHandle<BufferT>::grid(uint32_t n) const
 {
-    const uint8_t *data = mBuffer.data();
-    if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != mapToGridType<ValueT>()) return nullptr;
-    return reinterpret_cast<const NanoGrid<ValueT>*>(data + mMetaData[n].offset);
+    const void *data = mBuffer.data();
+    if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType<ValueT>()) return nullptr;
+    return util::PtrAdd<NanoGrid<ValueT>>(data, mMetaData[n].offset);
 }// const NanoGrid<ValueT>* GridHandle<BufferT>::grid(uint32_t n) const
 
 template<typename BufferT>
 template<typename ValueT>
-inline typename enable_if<BufferTraits<BufferT>::hasDeviceDual, const NanoGrid<ValueT>*>::type
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, const NanoGrid<ValueT>*>::type
 GridHandle<BufferT>::deviceGrid(uint32_t n) const
 {
-    const uint8_t *data = mBuffer.deviceData();
-    if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != mapToGridType<ValueT>()) return nullptr;
-    return reinterpret_cast<const NanoGrid<ValueT>*>(data + mMetaData[n].offset);
+    const void *data = mBuffer.deviceData();
+    if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType<ValueT>()) return nullptr;
+    return util::PtrAdd<NanoGrid<ValueT>>(data, mMetaData[n].offset);
 }// GridHandle<BufferT>::deviceGrid(uint32_t n) const
 
 template<typename BufferT>
@@ -395,7 +393,7 @@ void GridHandle<BufferT>::read(std::istream& is, uint32_t n, const BufferT& pool)
         auto buffer = BufferT::create(data.mGridSize, &pool);
         is.seekg(-sizeof(GridData), std::ios::cur);// rewind
         is.read((char*)(buffer.data()), data.mGridSize);
-        updateGridCount((GridData*)buffer.data(), 0u, 1u);
+        tools::updateGridCount((GridData*)buffer.data(), 0u, 1u);
         *this = GridHandle(std::move(buffer));
     } else {
         is.seekg(-sizeof(GridData), std::ios::cur);// rewind sizeof(GridData) bytes to undo initial read
@@ -420,7 +418,7 @@ void GridHandle<BufferT>::read(std::istream& is, const std::string &gridName, const BufferT& pool)
         if (n>data.mGridCount) throw std::runtime_error("No raw grid named \""+gridName+"\"");
         auto buffer = BufferT::create(data.mGridSize, &pool);
         is.read((char*)(buffer.data()), data.mGridSize);
-        updateGridCount((GridData*)buffer.data(), 0u, 1u);
+        tools::updateGridCount((GridData*)buffer.data(), 0u, 1u);
         *this = GridHandle(std::move(buffer));
     } else {
         throw std::logic_error("This file does not contain a valid raw buffer");
@@ -439,7 +437,7 @@ inline VectorT<GridHandle<BufferT>>
 splitGrids(const GridHandle<BufferT> &handle, const BufferT* other = nullptr)
 {
     using HandleT = GridHandle<BufferT>;
-    const uint8_t *ptr = handle.data();
+    const void *ptr = handle.data();
     if (ptr == nullptr) return VectorT<HandleT>();
     VectorT<HandleT> handles(handle.gridCount());
     for (auto &h : handles) {
@@ -448,9 +446,9 @@ splitGrids(const GridHandle<BufferT> &handle, const BufferT* other = nullptr)
         auto buffer = BufferT::create(src->mGridSize, other);
         GridData *dst = reinterpret_cast<GridData*>(buffer.data());
         std::memcpy(dst, src, src->mGridSize);
-        updateGridCount(dst, 0u, 1u);
+        tools::updateGridCount(dst, 0u, 1u);
         h = HandleT(std::move(buffer));
-        ptr += src->mGridSize;
+        ptr = util::PtrAdd(ptr, src->mGridSize);
     }
     return std::move(handles);
 }// splitGrids
@@ -471,16 +469,16 @@ mergeGrids(const VectorT<GridHandle<BufferT>> &handles, const BufferT* pool = nullptr)
         for (uint32_t n=0; n<h.gridCount(); ++n) {
             std::memcpy(dst, src, h.gridSize(n));
             GridData *data = reinterpret_cast<GridData*>(dst);
             NANOVDB_ASSERT(data->isValid());
-            updateGridCount(data, counter++, gridCount);
-            dst += data->mGridSize;
-            src += data->mGridSize;
+            tools::updateGridCount(data, counter++, gridCount);
+            dst = util::PtrAdd(dst, data->mGridSize);
+            src = util::PtrAdd(src, data->mGridSize);
         }
     }
     return GridHandle<BufferT>(std::move(buffer));
@@ -489,7 +487,7 @@
 } // namespace nanovdb
 
 #if defined(__CUDACC__)
-#include <nanovdb/util/cuda/CudaGridHandle.cuh>
+#include <nanovdb/cuda/GridHandle.cuh>
 #endif//
defined(__CUDACC__)
 
 #endif // NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/HostBuffer.h b/nanovdb/nanovdb/HostBuffer.h
new file mode 100644
index 0000000000..c664856a07
--- /dev/null
+++ b/nanovdb/nanovdb/HostBuffer.h
@@ -0,0 +1,590 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    @file nanovdb/HostBuffer.h
+
+    @date April 20, 2021
+
+    @brief HostBuffer - a buffer that contains a shared or private bump
+           pool to either externally or internally managed host memory.
+
+    @details This HostBuffer can be used in multiple ways, most of which are
+             demonstrated in the examples below. Memory in the pool can
+             be managed or unmanaged (e.g. internal or external) and can
+             be shared between multiple buffers or belong to a single buffer.
+
+    Example that uses HostBuffer::create inside io::readGrids to create a
+    full self-managed buffer, i.e. not shared and without padding, per grid in the file.
+    @code
+        auto handles = nanovdb::io::readGrids("file.nvdb");
+    @endcode
+
+    Example that uses HostBuffer::createFull. Assuming you have a raw pointer
+    to a NanoVDB grid of unknown type, this example shows how to create its
+    GridHandle which can be used to enquire about the grid type and meta data.
+    @code
+        void *data;// pointer to a NanoVDB grid of unknown type
+        uint64_t size;// byte size of NanoVDB grid of unknown type
+        auto buffer = nanovdb::HostBuffer::createFull(size, data);
+        nanovdb::GridHandle<> gridHandle(std::move(buffer));
+    @endcode
+
+    Example that uses HostBuffer::createPool for internally managed host memory.
+    Suppose you want to read multiple grids in multiple files, but reuse the same
+    fixed sized memory buffer to both avoid memory fragmentation as well as
+    exceeding the fixed memory ceiling!
+    @code
+        auto pool = nanovdb::HostBuffer::createPool(1 << 30);// 1 GB memory pool
+        std::vector<std::string> frames;// vector of grid names
+        for (int i=0; i<frames.size(); ++i) {
+            auto handles = nanovdb::io::readGrids(frames[i], 0, pool);
+            ...
+        }
+    @endcode
+
+    Example that uses HostBuffer::createPool for externally managed host memory.
+    @code
+        const size_t poolSize = 1 << 30;// 1 GB
+        std::unique_ptr<char[]> array(new char[poolSize + NANOVDB_DATA_ALIGNMENT]);// scoped pool of 1 GB with padding
+        void *buffer = nanovdb::alignPtr(array.get());// 32B aligned buffer
+        auto pool = nanovdb::HostBuffer::createPool(poolSize, buffer);
+        auto handles = nanovdb::io::readGrids("file.nvdb", 0, pool);
+    @endcode
+*/
+
+#ifndef NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/NanoVDB.h>// for NANOVDB_DATA_ALIGNMENT;
+#include <cstdint>       // for types like int32_t etc
+#include <cstdio>        // for fprintf
+#include <cstdlib>       // for std::malloc/std::realloc/std::free
+#include <memory>        // for std::make_shared
+#include <mutex>         // for std::mutex
+#include <unordered_set> // for std::unordered_set
+#include <cassert>       // for assert
+#include <sstream>       // for std::stringstream
+#include <cstring>       // for memcpy
+
+#define checkPtr(ptr, msg) \
+    { \
+        ptrAssert((ptr), (msg), __FILE__, __LINE__); \
+    }
+
+namespace nanovdb {
+
+template<typename BufferT>
+struct BufferTraits
+{
+    static constexpr bool hasDeviceDual = false;
+};
+
+// ----------------------------> HostBuffer <--------------------------------------
+
+/// @brief This is a buffer that contains a shared or private pool
+///        to either externally or internally managed host memory.
+///
+/// @note Terminology:
+///       Pool:   0 = buffer.size() < buffer.poolSize()
+///       Buffer: 0 < buffer.size() < buffer.poolSize()
+///       Full:   0 < buffer.size() = buffer.poolSize()
+///       Empty:  0 = buffer.size() = buffer.poolSize()
+class HostBuffer
+{
+    struct Pool;// forward declaration of private pool struct
+    std::shared_ptr<Pool> mPool;
+    uint64_t              mSize; // total number of bytes for the NanoVDB grid.
+    void*                 mData; // raw buffer for the NanoVDB grid.
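+    // Note that mData is not owned directly: it points into the Pool's
+    // allocation (after any alignment padding) and is registered with, and
+    // kept up to date by, the Pool; see Pool::add() and Pool::resize() below.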
+
+#if defined(DEBUG) || defined(_DEBUG)
+    static inline void ptrAssert(void* ptr, const char* msg, const char* file, int line, bool abort = true)
+    {
+        if (ptr == nullptr) {
+            fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line);
+            if (abort)
+                exit(1);
+        }
+        if (uint64_t(ptr) % NANOVDB_DATA_ALIGNMENT) {
+            fprintf(stderr, "Alignment pointer error: %s %s %d\n", msg, file, line);
+            if (abort)
+                exit(1);
+        }
+    }
+#else
+    static inline void ptrAssert(void*, const char*, const char*, int, bool = true)
+    {
+    }
+#endif
+
+public:
+    /// @brief Return a full buffer or an empty buffer
+    HostBuffer(uint64_t bufferSize = 0);
+
+    /// @brief Move constructor
+    HostBuffer(HostBuffer&& other);
+
+    /// @brief Custom destructor
+    ~HostBuffer() { this->clear(); }
+
+    /// @brief Move assignment operator
+    HostBuffer& operator=(HostBuffer&& other);
+
+    /// @brief Disallow copy-construction
+    HostBuffer(const HostBuffer&) = delete;
+
+    /// @brief Disallow copy assignment operation
+    HostBuffer& operator=(const HostBuffer&) = delete;
+
+    /// @brief Return a pool buffer which satisfies: buffer.size == 0,
+    ///        buffer.poolSize() == poolSize, and buffer.data() == nullptr.
+    ///        If data==nullptr, memory for the pool will be allocated.
+    ///
+    /// @throw If poolSize is zero.
+    static HostBuffer createPool(uint64_t poolSize, void *data = nullptr);
+
+    /// @brief Return a full buffer which satisfies: buffer.size == bufferSize,
+    ///        buffer.poolSize() == bufferSize, and buffer.data() == data.
+    ///        If data==nullptr, memory for the pool will be allocated.
+    ///
+    /// @throw If bufferSize is zero.
+    static HostBuffer createFull(uint64_t bufferSize, void *data = nullptr);
+
+    /// @brief Return a buffer with @c bufferSize bytes managed by
+    ///        the specified memory @c pool. If none is provided, i.e.
+    ///        @c pool == nullptr or @c pool->poolSize() == 0, one is
+    ///        created with size @c bufferSize, i.e. a full buffer is returned.
+    ///
+    /// @throw If the specified @c pool has insufficient memory for
+    ///        the requested buffer size.
+    static HostBuffer create(uint64_t bufferSize, const HostBuffer* pool = nullptr);
+
+    /// @brief Initialize as a full buffer with the specified size. If data is NULL
+    ///        the memory is internally allocated.
+    void init(uint64_t bufferSize, void *data = nullptr);
+
+    //@{
+    /// @brief Returns a pointer to the raw memory buffer managed by this allocator.
+    ///
+    /// @warning Note that the pointer can be NULL if the allocator was not initialized!
+    const void* data() const { return mData; }
+    void* data() { return mData; }
+    //@}
+
+    //@{
+    /// @brief Returns the size in bytes associated with this buffer.
+    uint64_t bufferSize() const { return mSize; }
+    uint64_t size() const { return this->bufferSize(); }
+    //@}
+
+    /// @brief Returns the size in bytes of the memory pool shared with this instance.
+    uint64_t poolSize() const;
+
+    /// @brief Return true if memory is managed (using std::malloc and std::free) by the
+    ///        shared pool in this buffer. Else memory is assumed to be managed externally.
+    bool isManaged() const;
+
+    //@{
+    /// @brief Returns true if this buffer has no memory associated with it
+    bool isEmpty() const { return !mPool || mSize == 0 || mData == nullptr; }
+    bool empty() const { return this->isEmpty(); }
+    //@}
+
+    /// @brief Return true if this is a pool, i.e. an empty buffer with a nonempty
+    ///        internal pool, i.e.
this->size() == 0 and this->poolSize() != 0
+    bool isPool() const { return mSize == 0 && this->poolSize() > 0; }
+
+    /// @brief Return true if the pool exists, is nonempty but has no more available memory
+    bool isFull() const;
+
+    /// @brief Clear this buffer so it is empty.
+    void clear();
+
+    /// @brief Clears all existing buffers that are registered against the memory pool
+    ///        and resets the pool so it can be reused to create new buffers.
+    ///
+    /// @throw If this instance is not empty or contains no pool.
+    ///
+    /// @warning This method is not thread-safe!
+    void reset();
+
+    /// @brief Total number of bytes from the pool currently in use by buffers
+    uint64_t poolUsage() const;
+
+    /// @brief Resize the pool size. It will attempt to resize the existing
+    ///        memory block, but if that fails a deep copy is performed.
+    ///        If @c data is not NULL it will be used as new externally
+    ///        managed memory for the pool. All registered buffers are
+    ///        updated so GridHandle::grid might return a new address (if
+    ///        deep copy was performed).
+    ///
+    /// @note  This method can be used to resize the memory pool and even
+    ///        change it from internally to externally managed memory or vice versa.
+    ///
+    /// @throw If @c poolSize is less than this->poolUsage(), i.e. the memory
+    ///        currently in use, or if allocations fail.
+    void resizePool(uint64_t poolSize, void *data = nullptr);
+
+}; // HostBuffer class
+
+// --------------------------> Implementation of HostBuffer::Pool <------------------------------------
+
+// This is private struct of HostBuffer so you can safely ignore the API
+struct HostBuffer::Pool
+{
+    using HashTableT = std::unordered_set<HostBuffer*>;
+    std::mutex mMutex; // mutex for updating mRegister and mFree
+    HashTableT mRegister;
+    void      *mData, *mFree;
+    uint64_t   mSize, mPadding;
+    bool       mManaged;
+
+    /// @brief External memory ctor
+    Pool(uint64_t size = 0, void* data = nullptr)
+        : mData(data)
+        , mFree(mData)
+        , mSize(size)
+        , mPadding(0)
+        , mManaged(data == nullptr)
+    {
+        if (mManaged) {
+            mData = Pool::alloc(mSize);
+            if (mData == nullptr) throw std::runtime_error("Pool::Pool malloc failed");
+        }
+        mPadding = alignmentPadding(mData);
+        if (!mManaged && mPadding != 0) {
+            throw std::runtime_error("Pool::Pool: external memory buffer is not aligned to " +
+                                     std::to_string(NANOVDB_DATA_ALIGNMENT) +
+                                     " bytes.\nHint: use nanovdb::alignPtr or std::aligned_alloc (C++17 only)");
+        }
+        mFree = util::PtrAdd(mData, mPadding);
+    }
+
+    /// @brief Custom destructor
+    ~Pool()
+    {
+        assert(mRegister.empty());
+        if (mManaged) std::free(mData);
+    }
+
+    /// @brief Disallow copy-construction
+    Pool(const Pool&) = delete;
+
+    /// @brief Disallow move-construction
+    Pool(const Pool&&) = delete;
+
+    /// @brief Disallow copy assignment operation
+    Pool& operator=(const Pool&) = delete;
+
+    /// @brief Disallow move assignment operation
+    Pool& operator=(const Pool&&) = delete;
+
+    /// @brief Return the total number of bytes used from this Pool by buffers
+    uint64_t usage() const { return util::PtrDiff(mFree, mData) - mPadding; }
+
+    /// @brief Allocate a buffer of the specified size and add it to the register
+    void add(HostBuffer* buffer, uint64_t size)
+    {
+        void *alignedFree = util::PtrAdd(mFree, alignmentPadding(mFree));
+
+        if (util::PtrAdd(alignedFree, size) > util::PtrAdd(mData, mPadding + mSize)) {
+            std::stringstream ss;
+            ss << "HostBuffer::Pool: insufficient memory\n"
+               << "\tA buffer requested " << size << " bytes with " << NANOVDB_DATA_ALIGNMENT
+               << "-bytes alignment from a pool with "
+               << mSize << " bytes 
of which\n\t" << (util::PtrDiff(alignedFree, mData) - mPadding)
+               << " bytes are used by " << mRegister.size() << " other buffer(s). "
+               << "Pool is " << (mManaged ? "internally" : "externally") << " managed.\n";
+            //std::cerr << ss.str();
+            throw std::runtime_error(ss.str());
+        }
+        buffer->mSize = size;
+        const std::lock_guard<std::mutex> lock(mMutex);
+        mRegister.insert(buffer);
+        buffer->mData = alignedFree;
+        mFree = util::PtrAdd(alignedFree, size);
+    }
+
+    /// @brief Remove the specified buffer from the register
+    void remove(HostBuffer *buffer)
+    {
+        const std::lock_guard<std::mutex> lock(mMutex);
+        mRegister.erase(buffer);
+    }
+
+    /// @brief Replaces buffer1 with buffer2 in the register
+    void replace(HostBuffer *buffer1, HostBuffer *buffer2)
+    {
+        const std::lock_guard<std::mutex> lock(mMutex);
+        mRegister.erase(buffer1);
+        mRegister.insert(buffer2);
+    }
+
+    /// @brief Reset the register and all its buffers
+    void reset()
+    {
+        for (HostBuffer *buffer : mRegister) {
+            buffer->mPool.reset();
+            buffer->mSize = 0;
+            buffer->mData = nullptr;
+        }
+        mRegister.clear();
+        mFree = util::PtrAdd(mData, mPadding);
+    }
+
+    /// @brief Resize this Pool and update registered buffers as needed. If data is not NULL
+    ///        it is used as externally managed memory.
+    void resize(uint64_t size, void *data = nullptr)
+    {
+        const uint64_t memUsage = this->usage();
+
+        const bool managed = (data == nullptr);
+
+        if (!managed && alignmentPadding(data) != 0) {
+            throw std::runtime_error("Pool::resize: external memory buffer is not aligned to " +
+                                     std::to_string(NANOVDB_DATA_ALIGNMENT) + " bytes");
+        }
+
+        if (memUsage > size) {
+            throw std::runtime_error("Pool::resize: insufficient memory");
+        }
+
+        uint64_t padding = 0;
+        if (mManaged && managed && size != mSize) { // managed -> managed
+            padding = mPadding;
+            data = Pool::realloc(mData, memUsage, size, padding); // performs both copy and free of mData
+        } else if (!mManaged && managed) { // un-managed -> managed
+            data = Pool::alloc(size);
+            padding = alignmentPadding(data);
+        }
+
+        if (data == nullptr) {
+            throw std::runtime_error("Pool::resize: allocation failed");
+        } else if (data != mData) {
+            void* paddedData = util::PtrAdd(data, padding);
+
+            if (!(mManaged && managed)) { // no need to copy if managed -> managed
+                memcpy(paddedData, util::PtrAdd(mData, mPadding), memUsage);
+            }
+
+            for (HostBuffer* buffer : mRegister) { // update registered buffers
+                //buffer->mData = paddedData + ptrdiff_t(buffer->mData - (mData + mPadding));
+                buffer->mData = util::PtrAdd(paddedData, util::PtrDiff(buffer->mData, util::PtrAdd(mData, mPadding)));
+            }
+            mFree = util::PtrAdd(paddedData, memUsage); // update the free pointer
+            if (mManaged && !managed) {// only free if managed -> un-managed
+                std::free(mData);
+            }
+
+            mData = data;
+            mPadding = padding;
+        }
+        mSize = size;
+        mManaged = managed;
+    }
+
+    /// @brief Return true if all the memory in this pool is in use.
+    bool isFull() const
+    {
+        assert(mFree <= util::PtrAdd(mData, mPadding + mSize));
+        return mSize > 0 ? 
mFree == util::PtrAdd(mData, mPadding + mSize) : false;
+    }
+
+private:
+
+    static void* alloc(uint64_t size)
+    {
+//#if (__cplusplus >= 201703L)
+//        return std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, size);//C++17 or newer
+//#else
+        // make sure we alloc enough space to align the result
+        return std::malloc(size + NANOVDB_DATA_ALIGNMENT);
+//#endif
+    }
+
+    static void* realloc(void* const origData,
+                         uint64_t    origSize,
+                         uint64_t    desiredSize,
+                         uint64_t&   padding)
+    {
+        // make sure we alloc enough space to align the result
+        void* data = std::realloc(origData, desiredSize + NANOVDB_DATA_ALIGNMENT);
+
+        if (data != nullptr && data != origData) {
+            uint64_t newPadding = alignmentPadding(data);
+            // Number of padding bytes may have changed -- move data if that's the case
+            if (newPadding != padding) {
+                // Realloc should not happen when shrinking down buffer, but let's be safe
+                std::memmove(util::PtrAdd(data, newPadding),
+                             util::PtrAdd(data, padding),
+                             math::Min(origSize, desiredSize));
+                padding = newPadding;
+            }
+        }
+
+        return data;
+    }
+
+};// struct HostBuffer::Pool
+
+// --------------------------> Implementation of HostBuffer <------------------------------------
+
+inline HostBuffer::HostBuffer(uint64_t size) : mPool(nullptr), mSize(size), mData(nullptr)
+{
+    if (size>0) {
+        mPool = std::make_shared<Pool>(size);
+        mData = mPool->mFree;
+        mPool->mRegister.insert(this);
+        mPool->mFree = util::PtrAdd(mPool->mFree, size);
+    }
+}
+
+inline HostBuffer::HostBuffer(HostBuffer&& other) : mPool(other.mPool), mSize(other.mSize), mData(other.mData)
+{
+    if (mPool && mSize != 0) {
+        mPool->replace(&other, this);
+    }
+    other.mPool.reset();
+    other.mSize = 0;
+    other.mData = nullptr;
+}
+
+inline void HostBuffer::init(uint64_t bufferSize, void *data)
+{
+    if (bufferSize == 0) {
+        throw std::runtime_error("HostBuffer: invalid buffer size");
+    }
+    if (mPool) {
+        mPool.reset();
+    }
+    if (!mPool || mPool->mSize != bufferSize) {
+        mPool = std::make_shared<Pool>(bufferSize, data);
+    }
+    mPool->add(this, bufferSize);
+}
+
+inline HostBuffer& HostBuffer::operator=(HostBuffer&& other)
+{
+    if (mPool) {
+        mPool->remove(this);
+    }
+    mPool = other.mPool;
+    mSize = other.mSize;
+    mData = other.mData;
+    if (mPool && mSize != 0) {
+        mPool->replace(&other, this);
+    }
+    other.mPool.reset();
+    other.mSize = 0;
+    other.mData = nullptr;
+    return *this;
+}
+
+inline uint64_t HostBuffer::poolSize() const
+{
+    return mPool ? mPool->mSize : 0u;
+}
+
+inline uint64_t HostBuffer::poolUsage() const
+{
+    return mPool ? mPool->usage(): 0u;
+}
+
+inline bool HostBuffer::isManaged() const
+{
+    return mPool ? mPool->mManaged : false;
+}
+
+inline bool HostBuffer::isFull() const
+{
+    return mPool ? 
mPool->isFull() : false;
+}
+
+inline HostBuffer HostBuffer::createPool(uint64_t poolSize, void *data)
+{
+    if (poolSize == 0) {
+        throw std::runtime_error("HostBuffer: invalid pool size");
+    }
+    HostBuffer buffer;
+    buffer.mPool = std::make_shared<Pool>(poolSize, data);
+    // note the buffer is NOT registered by its pool since it is not using its memory
+    buffer.mSize = 0;
+    buffer.mData = nullptr;
+    return buffer;
+}
+
+inline HostBuffer HostBuffer::createFull(uint64_t bufferSize, void *data)
+{
+    if (bufferSize == 0) {
+        throw std::runtime_error("HostBuffer: invalid buffer size");
+    }
+    HostBuffer buffer;
+    buffer.mPool = std::make_shared<Pool>(bufferSize, data);
+    buffer.mPool->add(&buffer, bufferSize);
+    return buffer;
+}
+
+inline HostBuffer HostBuffer::create(uint64_t bufferSize, const HostBuffer* pool)
+{
+    HostBuffer buffer;
+    if (pool == nullptr || !pool->mPool) {
+        buffer.mPool = std::make_shared<Pool>(bufferSize);
+    } else {
+        buffer.mPool = pool->mPool;
+    }
+    buffer.mPool->add(&buffer, bufferSize);
+    return buffer;
+}
+
+inline void HostBuffer::clear()
+{
+    if (mPool) {// remove self from the buffer register in the pool
+        mPool->remove(this);
+    }
+    mPool.reset();
+    mSize = 0;
+    mData = nullptr;
+}
+
+inline void HostBuffer::reset()
+{
+    if (this->size()>0) {
+        throw std::runtime_error("HostBuffer: only empty buffers can call reset");
+    }
+    if (!mPool) {
+        throw std::runtime_error("HostBuffer: this buffer contains no pool to reset");
+    }
+    mPool->reset();
+}
+
+inline void HostBuffer::resizePool(uint64_t size, void *data)
+{
+    if (!mPool) {
+        throw std::runtime_error("HostBuffer: this buffer contains no pool to resize");
+    }
+    mPool->resize(size, data);
+}
+
+} // namespace nanovdb
+
+#endif // end of NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h
index 2e37c46ac0..5e912a7868 100644
--- a/nanovdb/nanovdb/NanoVDB.h
+++ b/nanovdb/nanovdb/NanoVDB.h
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: MPL-2.0
 
 /*!
-    \file NanoVDB.h
+    \file nanovdb/NanoVDB.h
 
     \author Ken Museth
 
@@ -118,20 +118,31 @@
 #ifndef NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED
 #define NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED
 
-// NANOVDB_MAGIC_NUMBER is currently used for both grids and files (starting with v32.6.0)
-// NANOVDB_MAGIC_GRID will soon be used exclusively for grids
+// The following two header files are the only mandatory dependencies
+#include <nanovdb/util/Util.h>// for __hostdev__ and lots of other utility functions
+#include <nanovdb/math/Math.h>// for Coord, BBox, Vec3, Vec4 etc
+
+// Do not change this value! 32 byte alignment is fixed in NanoVDB
+#define NANOVDB_DATA_ALIGNMENT 32
+
+// NANOVDB_MAGIC_NUMB is currently used for both grids and files (starting with v32.6.0)
+// NANOVDB_MAGIC_GRID will soon be used exclusively for grids (serialized to a single buffer)
 // NANOVDB_MAGIC_FILE will soon be used exclusively for files
 // NANOVDB_MAGIC_NODE will soon be used exclusively for NodeManager
+// NANOVDB_MAGIC_FRAG will soon be used exclusively for a fragmented grid, i.e.
a grid that is not serialized
 // |     : 0 in 30 corresponds to 0 in NanoVDB0
-#define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t)
-#define NANOVDB_MAGIC_GRID 0x314244566f6e614eUL // "NanoVDB1" in hex - little endian (uint64_t)
-#define NANOVDB_MAGIC_FILE 0x324244566f6e614eUL // "NanoVDB2" in hex - little endian (uint64_t)
-#define NANOVDB_MAGIC_NODE 0x334244566f6e614eUL // "NanoVDB3" in hex - little endian (uint64_t)
-#define NANOVDB_MAGIC_MASK 0x00FFFFFFFFFFFFFFUL // use this mask to remove the number
+#define NANOVDB_MAGIC_NUMB 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t)
+#define NANOVDB_MAGIC_GRID 0x314244566f6e614eUL // "NanoVDB1" in hex - little endian (uint64_t)
+#define NANOVDB_MAGIC_FILE 0x324244566f6e614eUL // "NanoVDB2" in hex - little endian (uint64_t)
+#define NANOVDB_MAGIC_NODE 0x334244566f6e614eUL // "NanoVDB3" in hex - little endian (uint64_t)
+#define NANOVDB_MAGIC_FRAG 0x344244566f6e614eUL // "NanoVDB4" in hex - little endian (uint64_t)
+#define NANOVDB_MAGIC_MASK 0x00FFFFFFFFFFFFFFUL // use this mask to remove the number
+
+//#define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL
 //#define NANOVDB_USE_NEW_MAGIC_NUMBERS// used to enable use of the new magic numbers described above
 
 #define NANOVDB_MAJOR_VERSION_NUMBER 32 //  reflects changes to the ABI and hence also the file format
-#define NANOVDB_MINOR_VERSION_NUMBER 6 //  reflects changes to the API but not ABI
+#define NANOVDB_MINOR_VERSION_NUMBER 7 //  reflects changes to the API but not ABI
 #define NANOVDB_PATCH_VERSION_NUMBER 0 //  reflects changes that do not affect the ABI or API
 
 #define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
@@ -150,101 +161,11 @@
 
 #define NANOVDB_FPN_BRANCHLESS
 
-// Do not change this value! 32 byte alignment is fixed in NanoVDB
-#define NANOVDB_DATA_ALIGNMENT 32
-
 #if !defined(NANOVDB_ALIGN)
 #define NANOVDB_ALIGN(n) alignas(n)
 #endif // !defined(NANOVDB_ALIGN)
 
-#ifdef __CUDACC_RTC__
-
-typedef signed char        int8_t;
-typedef short              int16_t;
-typedef int                int32_t;
-typedef long long          int64_t;
-typedef unsigned char      uint8_t;
-typedef unsigned int       uint32_t;
-typedef unsigned short     uint16_t;
-typedef unsigned long long uint64_t;
-
-#define NANOVDB_ASSERT(x)
-
-#define UINT64_C(x) (x ## ULL)
-
-#else // !__CUDACC_RTC__
-
-#include <cstdlib> // for abs in clang7
-#include <cstdint> // for types like int32_t etc
-#include <cstddef> // for size_t type
-#include <cassert> // for assert
-#include <cstdio> // for snprintf
-#include <cmath> // for sqrt and fma
-#include <limits> // for numeric_limits
-#include <utility>// for std::move
-#ifdef NANOVDB_USE_IOSTREAMS
-#include <fstream>// for read/writeUncompressedGrids
-#endif
-// All asserts can be disabled here, even for debug builds
-#if 1
-#define NANOVDB_ASSERT(x) assert(x)
-#else
-#define NANOVDB_ASSERT(x)
-#endif
-
-#if defined(NANOVDB_USE_INTRINSICS) && defined(_MSC_VER)
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-#pragma intrinsic(_BitScanForward)
-#pragma intrinsic(_BitScanReverse64)
-#pragma intrinsic(_BitScanForward64)
-#endif
-
-#endif // __CUDACC_RTC__
-
-#if defined(__CUDACC__) || defined(__HIP__)
-// Only define __hostdev__ when using NVIDIA CUDA or HIP compilers
-#ifndef __hostdev__
-#define __hostdev__ __host__ __device__ // Runs on the CPU and GPU, called from the CPU or the GPU
-#endif
-#else
-// Dummy definitions of macros only defined by CUDA and HIP compilers
-#ifndef __hostdev__
-#define __hostdev__ // Runs on the CPU and GPU, called from the CPU or the GPU
-#endif
-#ifndef __global__
-#define __global__ // Runs on the GPU, called from the CPU or the GPU
-#endif
-
-// The following macro will suppress annoying warnings when nvcc
-// compiles functions that call (host) intrinsics (which is perfectly valid)
-#if defined(_MSC_VER) && defined(__CUDACC__)
-#define NANOVDB_HOSTDEV_DISABLE_WARNING __pragma("hd_warning_disable")
-#elif defined(__GNUC__) && defined(__CUDACC__)
-#define NANOVDB_HOSTDEV_DISABLE_WARNING _Pragma("hd_warning_disable")
-#else
-#define NANOVDB_HOSTDEV_DISABLE_WARNING
-#endif
-
-// Define compiler warnings that work with all compilers
-//#if defined(_MSC_VER)
-//#define NANO_WARNING(msg) _pragma("message" #msg)
-//#else
-//#define NANO_WARNING(msg) _Pragma("message" #msg)
-//#endif
-
-// A portable implementation of offsetof - unfortunately it doesn't work with static_assert
-#define NANOVDB_OFFSETOF(CLASS, MEMBER) ((int)(size_t)((char*)&((CLASS*)0)->MEMBER - (char*)0))
-
-namespace nanovdb {
+namespace nanovdb {// =================================================================

 // --------------------------> Build types <------------------------------------

@@ -283,13 +204,17 @@ class Point{};

 // --------------------------> GridType <------------------------------------

+/// @brief return the number of characters (including null termination) required to convert enum type to a string
+template<class EnumT>
+__hostdev__ inline constexpr uint32_t strlen(){return (uint32_t)EnumT::StrLen - (uint32_t)EnumT::End;}
+
 /// @brief List of types that are currently supported by NanoVDB
 ///
 /// @note To expand on this list do:
 ///       1) Add the new type between Unknown and End in the enum below
 ///       2) Add the new type to OpenToNanoVDB::processGrid that maps OpenVDB types to GridType
 ///       3) Verify that the ConvertTrait in NanoToOpenVDB.h works correctly with the new type
-///       4) Add the new type to mapToGridType (defined below) that maps NanoVDB types to GridType
+///       4) Add the new type to toGridType (defined below) that maps NanoVDB types to GridType
 ///       5) Add the new type to toStr (defined below)
 enum class GridType : uint32_t { Unknown = 0, // unknown value type - should rarely be used
                                  Float = 1, // single precision floating point value
@@ -317,21 +242,47 @@ enum class GridType : uint32_t { Unknown = 0, // unknown value type - should ra
                                  PointIndex = 23, // voxels encode indices to co-located points
                                  Vec3u8 = 24, // 8bit quantization of floating point 3D vector (only as blind data)
                                  Vec3u16 = 25, // 16bit quantization of floating point 3D vector (only as blind data)
-                                 End = 26 }; // should never be used
+                                 UInt8 = 26, // 8 bit unsigned integer values (e.g. 0 -> 255 gray scale)
+                                 End = 27,// total number of types in this enum (excluding StrLen since it's not a type)
+                                 StrLen = End + 12};// this entry is used to determine the minimum size of c-string

-#ifndef __CUDACC_RTC__
 /// @brief Maps a GridType to a c-string
-/// @param gridType GridType to be mapped to a string
+/// @param dst destination string of size 12 or larger
+/// @param gridType GridType enum to be mapped to a string
 /// @return Returns a c-string used to describe a GridType
"PointIndex", "Vec3u8", "Vec3u16", "End"}; - static_assert(sizeof(LUT) / sizeof(char*) - 1 == int(GridType::End), "Unexpected size of LUT"); - return LUT[static_cast(gridType)]; +__hostdev__ inline char* toStr(char *dst, GridType gridType) +{ + switch (gridType){ + case GridType::Unknown: return util::strcpy(dst, "?"); + case GridType::Float: return util::strcpy(dst, "float"); + case GridType::Double: return util::strcpy(dst, "double"); + case GridType::Int16: return util::strcpy(dst, "int16"); + case GridType::Int32: return util::strcpy(dst, "int32"); + case GridType::Int64: return util::strcpy(dst, "int64"); + case GridType::Vec3f: return util::strcpy(dst, "Vec3f"); + case GridType::Vec3d: return util::strcpy(dst, "Vec3d"); + case GridType::Mask: return util::strcpy(dst, "Mask"); + case GridType::Half: return util::strcpy(dst, "Half"); + case GridType::UInt32: return util::strcpy(dst, "uint32"); + case GridType::Boolean: return util::strcpy(dst, "bool"); + case GridType::RGBA8: return util::strcpy(dst, "RGBA8"); + case GridType::Fp4: return util::strcpy(dst, "Float4"); + case GridType::Fp8: return util::strcpy(dst, "Float8"); + case GridType::Fp16: return util::strcpy(dst, "Float16"); + case GridType::FpN: return util::strcpy(dst, "FloatN"); + case GridType::Vec4f: return util::strcpy(dst, "Vec4f"); + case GridType::Vec4d: return util::strcpy(dst, "Vec4d"); + case GridType::Index: return util::strcpy(dst, "Index"); + case GridType::OnIndex: return util::strcpy(dst, "OnIndex"); + case GridType::IndexMask: return util::strcpy(dst, "IndexMask"); + case GridType::OnIndexMask: return util::strcpy(dst, "OnIndexMask"); + case GridType::PointIndex: return util::strcpy(dst, "PointIndex"); + case GridType::Vec3u8: return util::strcpy(dst, "Vec3u8"); + case GridType::Vec3u16: return util::strcpy(dst, "Vec3u16"); + case GridType::UInt8: return util::strcpy(dst, "uint8"); + default: return util::strcpy(dst, "End"); + } } -#endif // --------------------------> GridClass <------------------------------------ @@ -346,17 +297,29 @@ enum class GridClass : uint32_t { Unknown = 0, VoxelVolume = 7, // volume of geometric cubes, e.g. colors cubes in Minecraft IndexGrid = 8, // grid whose values are offsets, e.g. 

 // --------------------------> GridClass <------------------------------------

@@ -346,17 +297,29 @@ enum class GridClass : uint32_t { Unknown = 0,
                                   VoxelVolume = 7, // volume of geometric cubes, e.g. colors cubes in Minecraft
                                   IndexGrid = 8, // grid whose values are offsets, e.g. into an external array
                                   TensorGrid = 9, // Index grid for indexing learnable tensor features
-                                  End = 10 };
+                                  End = 10,// total number of types in this enum (excluding StrLen since it's not a type)
+                                  StrLen = End + 7};// this entry is used to determine the minimum size of c-string

-#ifndef __CUDACC_RTC__
 /// @brief Returns a c-string used to describe a GridClass
-inline const char* toStr(GridClass gridClass)
-{
-    static const char* LUT[] = {"?", "SDF", "FOG", "MAC", "PNTIDX", "PNTDAT", "TOPO", "VOX", "INDEX", "TENSOR", "END"};
-    static_assert(sizeof(LUT) / sizeof(char*) - 1 == int(GridClass::End), "Unexpected size of LUT");
-    return LUT[static_cast<int>(gridClass)];
+/// @param dst destination string of size 7 or larger
+/// @param gridClass GridClass enum to be converted to a string
+__hostdev__ inline char* toStr(char *dst, GridClass gridClass)
+{
+    switch (gridClass){
+        case GridClass::Unknown: return util::strcpy(dst, "?");
+        case GridClass::LevelSet: return util::strcpy(dst, "SDF");
+        case GridClass::FogVolume: return util::strcpy(dst, "FOG");
+        case GridClass::Staggered: return util::strcpy(dst, "MAC");
+        case GridClass::PointIndex: return util::strcpy(dst, "PNTIDX");
+        case GridClass::PointData: return util::strcpy(dst, "PNTDAT");
+        case GridClass::Topology: return util::strcpy(dst, "TOPO");
+        case GridClass::VoxelVolume: return util::strcpy(dst, "VOX");
+        case GridClass::IndexGrid: return util::strcpy(dst, "INDEX");
+        case GridClass::TensorGrid: return util::strcpy(dst, "TENSOR");
+        default: return util::strcpy(dst, "END");
+    }
 }
-#endif

 // --------------------------> GridFlags <------------------------------------

@@ -369,23 +332,83 @@ enum class GridFlags : uint32_t {
                                  HasStdDeviation = 1 << 4, // nodes contain standard deviations of active values
                                  IsBreadthFirst = 1 << 5, // nodes are typically arranged breadth-first in memory
                                  End = 1 << 6, // use End - 1 as a mask for the 5 lower bit flags
+                                 StrLen = End + 23,// this entry is used to determine the minimum size of c-string
 };

-#ifndef __CUDACC_RTC__
 /// @brief Returns a c-string used to describe a GridFlags
-inline const char* toStr(GridFlags gridFlags)
-{
-    static const char* LUT[] = {"has long grid name",
-                                "has bbox",
-                                "has min/max",
-                                "has average",
-                                "has standard deviation",
-                                "is breadth-first",
-                                "end"};
-    static_assert(1 << (sizeof(LUT) / sizeof(char*) - 1) == int(GridFlags::End), "Unexpected size of LUT");
-    return LUT[static_cast<int>(gridFlags)];
+/// @param dst destination string of size 23 or larger
+/// @param gridFlags GridFlags enum to be converted to a string
+__hostdev__ inline const char* toStr(char *dst, GridFlags gridFlags)
+{
+    switch (gridFlags){
+        case GridFlags::HasLongGridName: return util::strcpy(dst, "has long grid name");
+        case GridFlags::HasBBox: return util::strcpy(dst, "has bbox");
+        case GridFlags::HasMinMax: return util::strcpy(dst, "has min/max");
+        case GridFlags::HasAverage: return util::strcpy(dst, "has average");
+        case GridFlags::HasStdDeviation: return util::strcpy(dst, "has standard deviation");
+        case GridFlags::IsBreadthFirst: return util::strcpy(dst, "is breadth-first");
+        default: return util::strcpy(dst, "end");
+    }
+}
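Since every GridFlags entry is a single bit, a packed flag word is described by testing one bit at a time. A sketch (illustration only), where flags is assumed to be a uint32_t holding a bitwise-or of the enum values above:

    char buf[strlen<GridFlags>()]; // 23 = StrLen - End
    for (uint32_t bit = 1; bit < uint32_t(GridFlags::End); bit <<= 1) {
        if (flags & bit) printf("%s\n", toStr(buf, GridFlags(bit)));
    }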
+
+// --------------------------> MagicType <------------------------------------
+
+/// @brief Enums used to identify magic numbers recognized by NanoVDB
+enum class MagicType : uint32_t { Unknown = 0,// first 64 bits are neither of the cases below
+                                  OpenVDB = 1,// first 32 bits = 0x56444220UL
+                                  NanoVDB = 2,// first 64 bits = NANOVDB_MAGIC_NUMB
+                                  NanoGrid = 3,// first 64 bits = NANOVDB_MAGIC_GRID
+                                  NanoFile = 4,// first 64 bits = NANOVDB_MAGIC_FILE
+                                  NanoNode = 5,// first 64 bits = NANOVDB_MAGIC_NODE
+                                  NanoFrag = 6,// first 64 bits = NANOVDB_MAGIC_FRAG
+                                  End = 7,
+                                  StrLen = End + 25};// this entry is used to determine the minimum size of c-string
+
+/// @brief maps 64 bits of magic number to enum
+__hostdev__ inline MagicType toMagic(uint64_t magic)
+{
+    switch (magic){
+        case NANOVDB_MAGIC_NUMB: return MagicType::NanoVDB;
+        case NANOVDB_MAGIC_GRID: return MagicType::NanoGrid;
+        case NANOVDB_MAGIC_FILE: return MagicType::NanoFile;
+        case NANOVDB_MAGIC_NODE: return MagicType::NanoNode;
+        case NANOVDB_MAGIC_FRAG: return MagicType::NanoFrag;
+        default: return (magic & ~uint32_t(0)) == 0x56444220UL ? MagicType::OpenVDB : MagicType::Unknown;
+    }
+}
+
+/// @brief print 64-bit magic number to string
+/// @param dst destination string of size 25 or larger
+/// @param magic 64 bit magic number to be printed
+/// @return return destination string @c dst
+__hostdev__ inline char* toStr(char *dst, MagicType magic)
+{
+    switch (magic){
+        case MagicType::Unknown: return util::strcpy(dst, "unknown");
+        case MagicType::NanoVDB: return util::strcpy(dst, "nanovdb");
+        case MagicType::NanoGrid: return util::strcpy(dst, "nanovdb::Grid");
+        case MagicType::NanoFile: return util::strcpy(dst, "nanovdb::File");
+        case MagicType::NanoNode: return util::strcpy(dst, "nanovdb::NodeManager");
+        case MagicType::NanoFrag: return util::strcpy(dst, "fragmented nanovdb::Grid");
+        case MagicType::OpenVDB: return util::strcpy(dst, "openvdb");
+        default: return util::strcpy(dst, "end");
+    }
+}
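toMagic lets a reader classify an in-memory buffer from its first eight bytes before committing to a grid, file, or NodeManager layout. A sketch (illustration only), assuming data points to at least eight readable bytes and host-side <cstring>/<cstdio>:

    uint64_t magic;
    memcpy(&magic, data, 8); // first 64 bits of the buffer
    char buf[strlen<MagicType>()]; // 25 = StrLen - End
    printf("buffer starts with: %s\n", toStr(buf, toMagic(magic)));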
+
+// --------------------------> PointType enums <------------------------------------
+
+// Define the type used when the points are encoded as blind data in the output grid
+enum class PointType : uint32_t { Disable = 0,// no point information e.g. when BuildT != Point
+                                  PointID = 1,// linear index of type uint32_t to points
+                                  World64 = 2,// Vec3d in world space
+                                  World32 = 3,// Vec3f in world space
+                                  Grid64 = 4,// Vec3d in grid space
+                                  Grid32 = 5,// Vec3f in grid space
+                                  Voxel32 = 6,// Vec3f in voxel space
+                                  Voxel16 = 7,// Vec3u16 in voxel space
+                                  Voxel8 = 8,// Vec3u8 in voxel space
+                                  Default = 9,// output matches input, i.e. Vec3d or Vec3f in world space
+                                  End = 10 };

 // --------------------------> GridBlindData enums <------------------------------------

@@ -410,37 +433,6 @@ enum class GridBlindDataSemantic : uint32_t { Unknown = 0,
                                               VoxelCoords = 9, // 3D coordinates in voxel space, e.g. (0.2, 0.0, 0.7)
                                               End = 10 };

-// --------------------------> is_same <------------------------------------
-
-/// @brief C++11 implementation of std::is_same
-/// @note When more than two arguments are provided value = T0==T1 || T0==T2 || ...
-template<typename T0, typename T1, typename ...T>
-struct is_same
-{
-    static constexpr bool value = is_same<T0, T1>::value || is_same<T0, T...>::value;
-};
-
-template<typename T0, typename T1>
-struct is_same<T0, T1>
-{
-    static constexpr bool value = false;
-};
-
-template<typename T>
-struct is_same<T, T>
-{
-    static constexpr bool value = true;
-};
-
-// --------------------------> is_floating_point <------------------------------------
-
-/// @brief C++11 implementation of std::is_floating_point
-template<typename T>
-struct is_floating_point
-{
-    static constexpr bool value = is_same<T, float, double>::value;
-};
-
 // --------------------------> BuildTraits <------------------------------------

 /// @brief Define static boolean tests for template build types
@@ -448,179 +440,20 @@ template<typename T>
 struct BuildTraits
 {
     // check if T is an index type
-    static constexpr bool is_index = is_same<T, ValueIndex, ValueOnIndex, ValueIndexMask, ValueOnIndexMask>::value;
-    static constexpr bool is_onindex = is_same<T, ValueOnIndex, ValueOnIndexMask>::value;
-    static constexpr bool is_offindex = is_same<T, ValueIndex, ValueIndexMask>::value;
-    static constexpr bool is_indexmask = is_same<T, ValueIndexMask, ValueOnIndexMask>::value;
+    static constexpr bool is_index = util::is_same<T, ValueIndex, ValueOnIndex, ValueIndexMask, ValueOnIndexMask>::value;
+    static constexpr bool is_onindex = util::is_same<T, ValueOnIndex, ValueOnIndexMask>::value;
+    static constexpr bool is_offindex = util::is_same<T, ValueIndex, ValueIndexMask>::value;
+    static constexpr bool is_indexmask = util::is_same<T, ValueIndexMask, ValueOnIndexMask>::value;
     // check if T is a compressed float type with fixed bit precision
-    static constexpr bool is_FpX = is_same<T, Fp4, Fp8, Fp16>::value;
+    static constexpr bool is_FpX = util::is_same<T, Fp4, Fp8, Fp16>::value;
     // check if T is a compressed float type with fixed or variable bit precision
-    static constexpr bool is_Fp = is_same<T, Fp4, Fp8, Fp16, FpN>::value;
+    static constexpr bool is_Fp = util::is_same<T, Fp4, Fp8, Fp16, FpN>::value;
     // check if T is a POD float type, i.e float or double
-    static constexpr bool is_float = is_floating_point<T>::value;
+    static constexpr bool is_float = util::is_floating_point<T>::value;
     // check if T is a template specialization of LeafData, i.e. has T mValues[512]
-    static constexpr bool is_special = is_index || is_Fp || is_same<T, Point, bool, ValueMask>::value;
+    static constexpr bool is_special = is_index || is_Fp || util::is_same<T, Point, bool, ValueMask>::value;
 }; // BuildTraits
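BuildTraits condenses the per-type is_same tests into named compile-time booleans. A sketch (illustration only), assuming the usual build tags (ValueOnIndex, Fp4, FpN, Point) declared in the build-types section above:

    static_assert(BuildTraits<ValueOnIndex>::is_index && BuildTraits<ValueOnIndex>::is_onindex, "index build type");
    static_assert(BuildTraits<Fp4>::is_FpX && BuildTraits<FpN>::is_Fp && !BuildTraits<FpN>::is_FpX, "fixed vs variable bit precision");
    static_assert(BuildTraits<Point>::is_special, "Point specializes LeafData");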
-
-// --------------------------> enable_if <------------------------------------
-
-/// @brief C++11 implementation of std::enable_if
-template<bool, typename T = void>
-struct enable_if
-{
-};
-
-template<typename T>
-struct enable_if<true, T>
-{
-    using type = T;
-};
-
-// --------------------------> disable_if <------------------------------------
-
-template<bool, typename T = void>
-struct disable_if
-{
-    typedef T type;
-};
-
-template<typename T>
-struct disable_if<true, T>
-{
-};
-
-// --------------------------> is_const <------------------------------------
-
-template<typename T>
-struct is_const
-{
-    static constexpr bool value = false;
-};
-
-template<typename T>
-struct is_const<const T>
-{
-    static constexpr bool value = true;
-};
-
-// --------------------------> is_pointer <------------------------------------
-
-/// @brief Trait used to identify template parameters that are pointers
-/// @tparam T Template parameter to be tested
-template<typename T>
-struct is_pointer
-{
-    static constexpr bool value = false;
-};
-
-/// @brief Template specialization of non-const pointers
-/// @tparam T Template parameter to be tested
-template<typename T>
-struct is_pointer<T*>
-{
-    static constexpr bool value = true;
-};
-
-/// @brief Template specialization of const pointers
-/// @tparam T Template parameter to be tested
-template<typename T>
-struct is_pointer<const T*>
-{
-    static constexpr bool value = true;
-};
-
-// --------------------------> remove_const <------------------------------------
-
-/// @brief Trait used to remove the const qualifier from a type. Default implementation is just a pass-through
-/// @tparam T Type
-/// @details remove_const<const float>::type = float
-template<typename T>
-struct remove_const
-{
-    using type = T;
-};
-
-/// @brief Template specialization of trait class used to remove the const qualifier from a type
-/// @tparam T Type of the const type
-/// @details remove_const<const float>::type = float
-template<typename T>
-struct remove_const<const T>
-{
-    using type = T;
-};
-
-// --------------------------> remove_reference <------------------------------------
-
-/// @brief Trait used to remove the reference, i.e. "&", qualifier from a type. Default implementation is just a pass-through
-/// @tparam T Type
-/// @details remove_reference<float&>::type = float
-template<typename T>
-struct remove_reference {using type = T;};
-
-/// @brief Template specialization of trait class used to remove the reference, i.e. "&", qualifier from a type
-/// @tparam T Type of the reference
-/// @details remove_reference<float&>::type = float
-template<typename T>
-struct remove_reference<T&> {using type = T;};
-
-// --------------------------> remove_pointer <------------------------------------
-
-/// @brief Trait used to remove the pointer, i.e. "*", qualifier from a type. Default implementation is just a pass-through
-/// @tparam T Type
-/// @details remove_pointer<float*>::type = float
-template<typename T>
-struct remove_pointer {using type = T;};
-
-/// @brief Template specialization of trait class used to remove the pointer, i.e. "*", qualifier from a type
-/// @tparam T Type of the pointer
-/// @details remove_pointer<float*>::type = float
-template<typename T>
-struct remove_pointer<T*> {using type = T;};
-
-// --------------------------> match_const <------------------------------------
-
-/// @brief Trait used to transfer the const-ness of a reference type to another type
-/// @tparam T Type whose const-ness needs to match the reference type
-/// @tparam ReferenceT Reference type that is not const
-/// @details match_const<const int, float>::type = int
-///          match_const<int, float>::type = int
-template<typename T, typename ReferenceT>
-struct match_const
-{
-    using type = typename remove_const<T>::type;
-};
-
-/// @brief Template specialization used to transfer the const-ness of a reference type to another type
-/// @tparam T Type that will adopt the const-ness of the reference type
-/// @tparam ReferenceT Reference type that is const
-/// @details match_const<int, const float>::type = const int
-///          match_const<const int, const float>::type = const int
-template<typename T, typename ReferenceT>
-struct match_const<T, const ReferenceT>
-{
-    using type = const typename remove_const<T>::type;
-};
-
-// --------------------------> is_specialization <------------------------------------
-
-/// @brief Metafunction used to determine if the first template
-///        parameter is a specialization of the class template
-///        given in the second template parameter.
-///
-/// @details is_specialization<Vec3<float>, Vec3>::value == true;
-///          is_specialization<Vec3f, Vec3>::value == true;
-///          is_specialization<std::vector<float>, std::vector>::value == true;
-template<typename AnyType, template<typename...> class TemplateType>
-struct is_specialization
-{
-    static const bool value = false;
-};
-template<typename... Args, template<typename...> class TemplateType>
-struct is_specialization<TemplateType<Args...>, TemplateType>
-{
-    static const bool value = true;
-};
-
 // --------------------------> BuildToValueMap <------------------------------------

 /// @brief Maps one type (e.g. the build types above) to other (actual) types
@@ -710,83 +543,19 @@ struct BuildToValueMap

 // --------------------------> utility functions related to alignment <------------------------------------

-/// @brief return true if the specified pointer is aligned
-__hostdev__ inline static bool isAligned(const void* p)
-{
-    return uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0;
-}
-
-/// @brief return true if the specified pointer is aligned and not NULL
-__hostdev__ inline static bool isValid(const void* p)
-{
-    return p != nullptr && uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0;
-}
+/// @brief return true if the specified pointer is 32 byte aligned
+__hostdev__ inline static bool isAligned(const void* p){return uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0;}

-/// @brief return the smallest number of bytes that when added to the specified pointer results in an aligned pointer
+/// @brief return the smallest number of bytes that when added to the specified pointer results in a 32 byte aligned pointer.
 __hostdev__ inline static uint64_t alignmentPadding(const void* p)
 {
     NANOVDB_ASSERT(p);
     return (NANOVDB_DATA_ALIGNMENT - (uint64_t(p) % NANOVDB_DATA_ALIGNMENT)) % NANOVDB_DATA_ALIGNMENT;
 }

-/// @brief offset the specified pointer so it is aligned.
-template<typename T>
-__hostdev__ inline static T* alignPtr(T* p)
-{
-    NANOVDB_ASSERT(p);
-    return reinterpret_cast<T*>( (uint8_t*)p + alignmentPadding(p) );
-}
-
-/// @brief offset the specified const pointer so it is aligned.
+/// @brief offset the specified pointer so it is 32 byte aligned. Works with both const and non-const pointers.
 template<typename T>
-__hostdev__ inline static const T* alignPtr(const T* p)
-{
-    NANOVDB_ASSERT(p);
-    return reinterpret_cast<const T*>( (const uint8_t*)p + alignmentPadding(p) );
-}
-
-// --------------------------> PtrDiff <------------------------------------
-
-/// @brief Compute the distance, in bytes, between two pointers
-/// @tparam T1 Type of the first pointer
-/// @tparam T2 Type of the second pointer
-/// @param p first pointer, assumed to NOT be NULL
-/// @param q second pointer, assumed to NOT be NULL
-/// @return signed distance between pointer addresses in units of bytes
-template<typename T1, typename T2>
-__hostdev__ inline static int64_t PtrDiff(const T1* p, const T2* q)
-{
-    NANOVDB_ASSERT(p && q);
-    return reinterpret_cast<const char*>(p) - reinterpret_cast<const char*>(q);
-}
-
-// --------------------------> PtrAdd <------------------------------------
-
-/// @brief Adds a byte offset of a non-const pointer to produce another non-const pointer
-/// @tparam DstT Type of the return pointer
-/// @tparam SrcT Type of the input pointer
-/// @param p non-const input pointer, assumed to NOT be NULL
-/// @param offset signed byte offset
-/// @return a non-const pointer defined as the offset of an input pointer
-template<typename DstT, typename SrcT>
-__hostdev__ inline static DstT* PtrAdd(SrcT* p, int64_t offset)
-{
-    NANOVDB_ASSERT(p);
-    return reinterpret_cast<DstT*>(reinterpret_cast<char*>(p) + offset);
-}
-
-/// @brief Adds a byte offset of a const pointer to produce another const pointer
-/// @tparam DstT Type of the return pointer
-/// @tparam SrcT Type of the input pointer
-/// @param p const input pointer, assumed to NOT be NULL
-/// @param offset signed byte offset
-/// @return a const pointer defined as the offset of a const input pointer
-template<typename DstT, typename SrcT>
-__hostdev__ inline static const DstT* PtrAdd(const SrcT* p, int64_t offset)
-{
-    NANOVDB_ASSERT(p);
-    return reinterpret_cast<const DstT*>(reinterpret_cast<const char*>(p) + offset);
-}
+__hostdev__ inline static T* alignPtr(T* p){return util::PtrAdd(p, alignmentPadding(p));}
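A sketch of the alignment helpers on a deliberately misaligned address (illustration only; buffer is a hypothetical local array):

    uint8_t buffer[2 * NANOVDB_DATA_ALIGNMENT];
    uint8_t* p = buffer + 1; // in general not 32 byte aligned
    uint8_t* q = alignPtr(p); // advanced by alignmentPadding(p) bytes
    NANOVDB_ASSERT(isAligned(q) && alignmentPadding(q) == 0);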

 // --------------------------> isFloatingPoint(GridType) <------------------------------------

@@ -822,7 +591,8 @@ __hostdev__ inline bool isInteger(GridType gridType)
     return gridType == GridType::Int16 ||
            gridType == GridType::Int32 ||
            gridType == GridType::Int64 ||
-           gridType == GridType::UInt32;
+           gridType == GridType::UInt32 ||
+           gridType == GridType::UInt8;
 }

 // --------------------------> isIndex(GridType) <------------------------------------

@@ -837,23 +607,6 @@ __hostdev__ inline bool isIndex(GridType gridType)
            gridType == GridType::OnIndexMask;// as OnIndex, but with an additional mask
 }

-// --------------------------> memcpy64 <------------------------------------
-
-/// @brief copy 64 bit words from @c src to @c dst
-/// @param dst 64 bit aligned pointer to destination
-/// @param src 64 bit aligned pointer to source
-/// @param word_count number of 64 bit words to be copied
-/// @return destination pointer @c dst
-/// @warning @c src and @c dst cannot overlap and should both be 64 bit aligned
-__hostdev__ inline static void* memcpy64(void *dst, const void *src, size_t word_count)
-{
-    NANOVDB_ASSERT(uint64_t(dst) % 8 == 0 && uint64_t(src) % 8 == 0);
-    auto *d = reinterpret_cast<uint64_t*>(dst), *e = d + word_count;
-    auto *s = reinterpret_cast<const uint64_t*>(src);
-    while (d != e) *d++ = *s++;
-    return dst;
-}
-
 // --------------------------> isValue(GridType, GridClass) <------------------------------------

 /// @brief return true if the combination of GridType and GridClass is valid.
@@ -872,7 +625,8 @@ __hostdev__ inline bool isValid(GridType gridType, GridClass gridClass)
     } else if (gridClass == GridClass::VoxelVolume) {
         return gridType == GridType::RGBA8 || gridType == GridType::Float ||
                gridType == GridType::Double || gridType == GridType::Vec3f ||
-               gridType == GridType::Vec3d || gridType == GridType::UInt32;
+               gridType == GridType::Vec3d || gridType == GridType::UInt32 ||
+               gridType == GridType::UInt8;
     }
     return gridClass < GridClass::End && gridType < GridType::End; // any valid combination
 }

@@ -925,6 +679,7 @@ class Version
 {
     uint32_t mData; // 11 + 11 + 10 bit packing of major + minor + patch
 public:
+    static constexpr uint32_t End = 0, StrLen = 8;// for strlen<Version>()
     /// @brief Default constructor
     __hostdev__ Version()
         : mData(uint32_t(NANOVDB_MAJOR_VERSION_NUMBER) << 21 |
@@ -954,1710 +709,191 @@ class Version
     __hostdev__ bool isCompatible() const { return this->getMajor() == uint32_t(NANOVDB_MAJOR_VERSION_NUMBER); }
     /// @brief Returns the difference between major version of this instance and NANOVDB_MAJOR_VERSION_NUMBER
     /// @return return 0 if the major version equals NANOVDB_MAJOR_VERSION_NUMBER, else a negative age if this
-    ///         instance has a smaller major version (is older), and a positive age if it is newer, i.e. larger.
-    __hostdev__ int age() const {return int(this->getMajor()) - int(NANOVDB_MAJOR_VERSION_NUMBER);}
-
-#ifndef __CUDACC_RTC__
-    /// @brief returns a c-string of the semantic version, i.e. 
major.minor.patch - const char* c_str() const - { - char* buffer = (char*)malloc(4 + 1 + 4 + 1 + 4 + 1); // xxxx.xxxx.xxxx\0 - snprintf(buffer, 4 + 1 + 4 + 1 + 4 + 1, "%u.%u.%u", this->getMajor(), this->getMinor(), this->getPatch()); // Prevents overflows by enforcing a fixed size of buffer - return buffer; - } -#endif -}; // Version - -// ----------------------------> Various math functions <------------------------------------- - -//@{ -/// @brief Pi constant taken from Boost to match old behaviour -template -inline __hostdev__ constexpr T pi() -{ - return 3.141592653589793238462643383279502884e+00; -} -template<> -inline __hostdev__ constexpr float pi() -{ - return 3.141592653589793238462643383279502884e+00F; -} -template<> -inline __hostdev__ constexpr double pi() -{ - return 3.141592653589793238462643383279502884e+00; -} -template<> -inline __hostdev__ constexpr long double pi() -{ - return 3.141592653589793238462643383279502884e+00L; -} -//@} - -//@{ -/// Tolerance for floating-point comparison -template -struct Tolerance; -template<> -struct Tolerance -{ - __hostdev__ static float value() { return 1e-8f; } -}; -template<> -struct Tolerance -{ - __hostdev__ static double value() { return 1e-15; } -}; -//@} - -//@{ -/// Delta for small floating-point offsets -template -struct Delta; -template<> -struct Delta -{ - __hostdev__ static float value() { return 1e-5f; } -}; -template<> -struct Delta -{ - __hostdev__ static double value() { return 1e-9; } -}; -//@} - -//@{ -/// Maximum floating-point values -template -struct Maximum; -#if defined(__CUDA_ARCH__) || defined(__HIP__) -template<> -struct Maximum -{ - __hostdev__ static int value() { return 2147483647; } -}; -template<> -struct Maximum -{ - __hostdev__ static uint32_t value() { return 4294967295u; } -}; -template<> -struct Maximum -{ - __hostdev__ static float value() { return 1e+38f; } -}; -template<> -struct Maximum -{ - __hostdev__ static double value() { return 1e+308; } -}; -#else -template -struct Maximum -{ - static T value() { return std::numeric_limits::max(); } -}; -#endif -//@} - -template -__hostdev__ inline bool isApproxZero(const Type& x) -{ - return !(x > Tolerance::value()) && !(x < -Tolerance::value()); -} - -template -__hostdev__ inline Type Min(Type a, Type b) -{ - return (a < b) ? a : b; -} -__hostdev__ inline int32_t Min(int32_t a, int32_t b) -{ - return int32_t(fminf(float(a), float(b))); -} -__hostdev__ inline uint32_t Min(uint32_t a, uint32_t b) -{ - return uint32_t(fminf(float(a), float(b))); -} -__hostdev__ inline float Min(float a, float b) -{ - return fminf(a, b); -} -__hostdev__ inline double Min(double a, double b) -{ - return fmin(a, b); -} -template -__hostdev__ inline Type Max(Type a, Type b) -{ - return (a > b) ? 
a : b; -} - -__hostdev__ inline int32_t Max(int32_t a, int32_t b) -{ - return int32_t(fmaxf(float(a), float(b))); -} -__hostdev__ inline uint32_t Max(uint32_t a, uint32_t b) -{ - return uint32_t(fmaxf(float(a), float(b))); -} -__hostdev__ inline float Max(float a, float b) -{ - return fmaxf(a, b); -} -__hostdev__ inline double Max(double a, double b) -{ - return fmax(a, b); -} -__hostdev__ inline float Clamp(float x, float a, float b) -{ - return Max(Min(x, b), a); -} -__hostdev__ inline double Clamp(double x, double a, double b) -{ - return Max(Min(x, b), a); -} - -__hostdev__ inline float Fract(float x) -{ - return x - floorf(x); -} -__hostdev__ inline double Fract(double x) -{ - return x - floor(x); -} - -__hostdev__ inline int32_t Floor(float x) -{ - return int32_t(floorf(x)); -} -__hostdev__ inline int32_t Floor(double x) -{ - return int32_t(floor(x)); -} - -__hostdev__ inline int32_t Ceil(float x) -{ - return int32_t(ceilf(x)); -} -__hostdev__ inline int32_t Ceil(double x) -{ - return int32_t(ceil(x)); -} - -template -__hostdev__ inline T Pow2(T x) -{ - return x * x; -} - -template -__hostdev__ inline T Pow3(T x) -{ - return x * x * x; -} - -template -__hostdev__ inline T Pow4(T x) -{ - return Pow2(x * x); -} -template -__hostdev__ inline T Abs(T x) -{ - return x < 0 ? -x : x; -} - -template<> -__hostdev__ inline float Abs(float x) -{ - return fabsf(x); -} - -template<> -__hostdev__ inline double Abs(double x) -{ - return fabs(x); -} - -template<> -__hostdev__ inline int Abs(int x) -{ - return abs(x); -} - -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz); - -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz) -{ - return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); - //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); - //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); -} - -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz) -{ - return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); -} - -template class Vec3T> -__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) -{ - return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); -} - -//@{ -/// Return the square root of a floating-point value. -__hostdev__ inline float Sqrt(float x) -{ - return sqrtf(x); -} -__hostdev__ inline double Sqrt(double x) -{ - return sqrt(x); -} -//@} - -/// Return the sign of the given value as an integer (either -1, 0 or 1). -template -__hostdev__ inline T Sign(const T& x) -{ - return ((T(0) < x) ? T(1) : T(0)) - ((x < T(0)) ? 
T(1) : T(0)); -} - -template -__hostdev__ inline int MinIndex(const Vec3T& v) -{ -#if 0 - static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values - const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 - return hashTable[hashKey]; -#else - if (v[0] < v[1] && v[0] < v[2]) - return 0; - if (v[1] < v[2]) - return 1; - else - return 2; -#endif -} - -template -__hostdev__ inline int MaxIndex(const Vec3T& v) -{ -#if 0 - static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values - const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 - return hashTable[hashKey]; -#else - if (v[0] > v[1] && v[0] > v[2]) - return 0; - if (v[1] > v[2]) - return 1; - else - return 2; -#endif -} - -/// @brief round up byteSize to the nearest wordSize, e.g. to align to machine word: AlignUp -__hostdev__ inline uint64_t AlignUp(uint64_t byteCount) -{ - const uint64_t r = byteCount % wordSize; - return r ? byteCount - r + wordSize : byteCount; -} - -// ------------------------------> Coord <-------------------------------------- - -// forward declaration so we can define Coord::asVec3s and Coord::asVec3d -template -class Vec3; - -/// @brief Signed (i, j, k) 32-bit integer coordinate class, similar to openvdb::math::Coord -class Coord -{ - int32_t mVec[3]; // private member data - three signed index coordinates -public: - using ValueType = int32_t; - using IndexType = uint32_t; - - /// @brief Initialize all coordinates to zero. - __hostdev__ Coord() - : mVec{0, 0, 0} - { - } - - /// @brief Initializes all coordinates to the given signed integer. - __hostdev__ explicit Coord(ValueType n) - : mVec{n, n, n} - { - } - - /// @brief Initializes coordinate to the given signed integers. - __hostdev__ Coord(ValueType i, ValueType j, ValueType k) - : mVec{i, j, k} - { - } - - __hostdev__ Coord(ValueType* ptr) - : mVec{ptr[0], ptr[1], ptr[2]} - { - } - - __hostdev__ int32_t x() const { return mVec[0]; } - __hostdev__ int32_t y() const { return mVec[1]; } - __hostdev__ int32_t z() const { return mVec[2]; } - - __hostdev__ int32_t& x() { return mVec[0]; } - __hostdev__ int32_t& y() { return mVec[1]; } - __hostdev__ int32_t& z() { return mVec[2]; } - - __hostdev__ static Coord max() { return Coord(int32_t((1u << 31) - 1)); } - - __hostdev__ static Coord min() { return Coord(-int32_t((1u << 31) - 1) - 1); } - - __hostdev__ static size_t memUsage() { return sizeof(Coord); } - - /// @brief Return a const reference to the given Coord component. - /// @warning The argument is assumed to be 0, 1, or 2. - __hostdev__ const ValueType& operator[](IndexType i) const { return mVec[i]; } - - /// @brief Return a non-const reference to the given Coord component. - /// @warning The argument is assumed to be 0, 1, or 2. - __hostdev__ ValueType& operator[](IndexType i) { return mVec[i]; } - - /// @brief Assignment operator that works with openvdb::Coord - template - __hostdev__ Coord& operator=(const CoordT& other) - { - static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); - mVec[0] = other[0]; - mVec[1] = other[1]; - mVec[2] = other[2]; - return *this; - } - - /// @brief Return a new instance with coordinates masked by the given unsigned integer. - __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } - - // @brief Return a new instance with coordinates left-shifted by the given unsigned integer. 
- __hostdev__ Coord operator<<(IndexType n) const { return Coord(mVec[0] << n, mVec[1] << n, mVec[2] << n); } - - // @brief Return a new instance with coordinates right-shifted by the given unsigned integer. - __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } - - /// @brief Return true if this Coord is lexicographically less than the given Coord. - __hostdev__ bool operator<(const Coord& rhs) const - { - return mVec[0] < rhs[0] ? true - : mVec[0] > rhs[0] ? false - : mVec[1] < rhs[1] ? true - : mVec[1] > rhs[1] ? false - : mVec[2] < rhs[2] ? true : false; - } - - /// @brief Return true if this Coord is lexicographically less or equal to the given Coord. - __hostdev__ bool operator<=(const Coord& rhs) const - { - return mVec[0] < rhs[0] ? true - : mVec[0] > rhs[0] ? false - : mVec[1] < rhs[1] ? true - : mVec[1] > rhs[1] ? false - : mVec[2] <=rhs[2] ? true : false; - } - - // @brief Return true if the Coord components are identical. - __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } - __hostdev__ Coord& operator&=(int n) - { - mVec[0] &= n; - mVec[1] &= n; - mVec[2] &= n; - return *this; - } - __hostdev__ Coord& operator<<=(uint32_t n) - { - mVec[0] <<= n; - mVec[1] <<= n; - mVec[2] <<= n; - return *this; - } - __hostdev__ Coord& operator>>=(uint32_t n) - { - mVec[0] >>= n; - mVec[1] >>= n; - mVec[2] >>= n; - return *this; - } - __hostdev__ Coord& operator+=(int n) - { - mVec[0] += n; - mVec[1] += n; - mVec[2] += n; - return *this; - } - __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } - __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } - __hostdev__ Coord operator-() const { return Coord(-mVec[0], -mVec[1], -mVec[2]); } - __hostdev__ Coord& operator+=(const Coord& rhs) - { - mVec[0] += rhs[0]; - mVec[1] += rhs[1]; - mVec[2] += rhs[2]; - return *this; - } - __hostdev__ Coord& operator-=(const Coord& rhs) - { - mVec[0] -= rhs[0]; - mVec[1] -= rhs[1]; - mVec[2] -= rhs[2]; - return *this; - } - - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Coord& minComponent(const Coord& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Coord& maxComponent(const Coord& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - return *this; - } -#if defined(__CUDACC__) // the following functions only run on the GPU! 
- __device__ inline Coord& minComponentAtomic(const Coord& other) - { - atomicMin(&mVec[0], other[0]); - atomicMin(&mVec[1], other[1]); - atomicMin(&mVec[2], other[2]); - return *this; - } - __device__ inline Coord& maxComponentAtomic(const Coord& other) - { - atomicMax(&mVec[0], other[0]); - atomicMax(&mVec[1], other[1]); - atomicMax(&mVec[2], other[2]); - return *this; - } -#endif - - __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const - { - return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); - } - - __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } - - /// Return true if any of the components of @a a are smaller than the - /// corresponding components of @a b. - __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) - { - return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); - } - - /// @brief Return the largest integer coordinates that are not greater - /// than @a xyz (node centered conversion). - template - __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(nanovdb::Floor(xyz[0]), nanovdb::Floor(xyz[1]), nanovdb::Floor(xyz[2])); } - - /// @brief Return a hash key derived from the existing coordinates. - /// @details The hash function is originally taken from the SIGGRAPH paper: - /// "VDB: High-resolution sparse volumes with dynamic topology" - /// and the prime numbers are modified based on the ACM Transactions on Graphics paper: - /// "Real-time 3D reconstruction at scale using voxel hashing" (the second number had a typo!) - template - __hostdev__ uint32_t hash() const { return ((1 << Log2N) - 1) & (mVec[0] * 73856093 ^ mVec[1] * 19349669 ^ mVec[2] * 83492791); } - - /// @brief Return the octant of this Coord - //__hostdev__ size_t octant() const { return (uint32_t(mVec[0])>>31) | ((uint32_t(mVec[1])>>31)<<1) | ((uint32_t(mVec[2])>>31)<<2); } - __hostdev__ uint8_t octant() const { return (uint8_t(bool(mVec[0] & (1u << 31)))) | - (uint8_t(bool(mVec[1] & (1u << 31))) << 1) | - (uint8_t(bool(mVec[2] & (1u << 31))) << 2); } - - /// @brief Return a single precision floating-point vector of this coordinate - __hostdev__ inline Vec3 asVec3s() const; - - /// @brief Return a double precision floating-point vector of this coordinate - __hostdev__ inline Vec3 asVec3d() const; - - // returns a copy of itself, so it mimics the behaviour of Vec3::round() - __hostdev__ inline Coord round() const { return *this; } -}; // Coord class - -// ----------------------------> Vec3 <-------------------------------------- - -/// @brief A simple vector class with three components, similar to openvdb::math::Vec3 -template -class Vec3 -{ - T mVec[3]; - -public: - static const int SIZE = 3; - static const int size = 3; // in openvdb::math::Tuple - using ValueType = T; - Vec3() = default; - __hostdev__ explicit Vec3(T x) - : mVec{x, x, x} - { - } - __hostdev__ Vec3(T x, T y, T z) - : mVec{x, y, z} - { - } - template class Vec3T, class T2> - __hostdev__ Vec3(const Vec3T& v) - : mVec{T(v[0]), T(v[1]), T(v[2])} - { - static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); - } - template - __hostdev__ explicit Vec3(const Vec3& v) - : mVec{T(v[0]), T(v[1]), T(v[2])} - { - } - __hostdev__ explicit Vec3(const Coord& ijk) - : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} - { - } - __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } - 
template class Vec3T, class T2> - __hostdev__ Vec3& operator=(const Vec3T& rhs) - { - static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); - mVec[0] = rhs[0]; - mVec[1] = rhs[1]; - mVec[2] = rhs[2]; - return *this; - } - __hostdev__ const T& operator[](int i) const { return mVec[i]; } - __hostdev__ T& operator[](int i) { return mVec[i]; } - template - __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } - template - __hostdev__ Vec3 cross(const Vec3T& v) const - { - return Vec3(mVec[1] * v[2] - mVec[2] * v[1], - mVec[2] * v[0] - mVec[0] * v[2], - mVec[0] * v[1] - mVec[1] * v[0]); - } - __hostdev__ T lengthSqr() const - { - return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2]; // 5 flops - } - __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } - __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } - __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } - __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } - __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } - __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } - __hostdev__ Vec3 operator+(const Coord& ijk) const { return Vec3(mVec[0] + ijk[0], mVec[1] + ijk[1], mVec[2] + ijk[2]); } - __hostdev__ Vec3 operator-(const Coord& ijk) const { return Vec3(mVec[0] - ijk[0], mVec[1] - ijk[1], mVec[2] - ijk[2]); } - __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } - __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec3& operator+=(const Vec3& v) - { - mVec[0] += v[0]; - mVec[1] += v[1]; - mVec[2] += v[2]; - return *this; - } - __hostdev__ Vec3& operator+=(const Coord& ijk) - { - mVec[0] += T(ijk[0]); - mVec[1] += T(ijk[1]); - mVec[2] += T(ijk[2]); - return *this; - } - __hostdev__ Vec3& operator-=(const Vec3& v) - { - mVec[0] -= v[0]; - mVec[1] -= v[1]; - mVec[2] -= v[2]; - return *this; - } - __hostdev__ Vec3& operator-=(const Coord& ijk) - { - mVec[0] -= T(ijk[0]); - mVec[1] -= T(ijk[1]); - mVec[2] -= T(ijk[2]); - return *this; - } - __hostdev__ Vec3& operator*=(const T& s) - { - mVec[0] *= s; - mVec[1] *= s; - mVec[2] *= s; - return *this; - } - __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / s; } - __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec3& minComponent(const Vec3& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Vec3& maxComponent(const Vec3& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - return *this; - } - /// @brief Return the smallest vector component - __hostdev__ ValueType min() const - { - return mVec[0] < mVec[1] ? (mVec[0] < mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] < mVec[2] ? mVec[1] : mVec[2]); - } - /// @brief Return the largest vector component - __hostdev__ ValueType max() const - { - return mVec[0] > mVec[1] ? (mVec[0] > mVec[2] ? 
mVec[0] : mVec[2]) : (mVec[1] > mVec[2] ? mVec[1] : mVec[2]);
-    }
-    /// @brief Round each component of this Vec down to its integer value
-    /// @return Return an integer Coord
-    __hostdev__ Coord floor() const { return Coord(Floor(mVec[0]), Floor(mVec[1]), Floor(mVec[2])); }
-    /// @brief Round each component of this Vec up to its integer value
-    /// @return Return an integer Coord
-    __hostdev__ Coord ceil() const { return Coord(Ceil(mVec[0]), Ceil(mVec[1]), Ceil(mVec[2])); }
-    /// @brief Round each component of this Vec to its closest integer value
-    /// @return Return an integer Coord
-    __hostdev__ Coord round() const
-    {
-        if constexpr(is_same<T, float>::value) {
-            return Coord(Floor(mVec[0] + 0.5f), Floor(mVec[1] + 0.5f), Floor(mVec[2] + 0.5f));
-        } else if constexpr(is_same<T, int32_t>::value) {
-            return Coord(mVec[0], mVec[1], mVec[2]);
-        } else {
-            return Coord(Floor(mVec[0] + 0.5), Floor(mVec[1] + 0.5), Floor(mVec[2] + 0.5));
-        }
-    }
-
-    /// @brief return a non-const raw pointer to the array of three vector components
-    __hostdev__ T* asPointer() { return mVec; }
-    /// @brief return a const raw pointer to the array of three vector components
-    __hostdev__ const T* asPointer() const { return mVec; }
-}; // Vec3
-
-template<typename T1, typename T2>
-__hostdev__ inline Vec3<T2> operator*(T1 scalar, const Vec3<T2>& vec)
-{
-    return Vec3<T2>(scalar * vec[0], scalar * vec[1], scalar * vec[2]);
-}
-template<typename T1, typename T2>
-__hostdev__ inline Vec3<T2> operator/(T1 scalar, const Vec3<T2>& vec)
-{
-    return Vec3<T2>(scalar / vec[0], scalar / vec[1], scalar / vec[2]);
-}
-
-//using Vec3R = Vec3<double>;// deprecated
-using Vec3d = Vec3<double>;
-using Vec3f = Vec3<float>;
-using Vec3i = Vec3<int32_t>;
-using Vec3u = Vec3<uint32_t>;
-using Vec3u8 = Vec3<uint8_t>;
-using Vec3u16 = Vec3<uint16_t>;
-
-/// @brief Return a single precision floating-point vector of this coordinate
-__hostdev__ inline Vec3f Coord::asVec3s() const
-{
-    return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2]));
-}
-
-/// @brief Return a double precision floating-point vector of this coordinate
-__hostdev__ inline Vec3d Coord::asVec3d() const
-{
-    return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2]));
-}
-
-// ----------------------------> Vec4 <--------------------------------------
-
-/// @brief A simple vector class with four components, similar to openvdb::math::Vec4
-template<typename T>
-class Vec4
-{
-    T mVec[4];
-
-public:
-    static const int SIZE = 4;
-    static const int size = 4;
-    using ValueType = T;
-    Vec4() = default;
-    __hostdev__ explicit Vec4(T x)
-        : mVec{x, x, x, x}
-    {
-    }
-    __hostdev__ Vec4(T x, T y, T z, T w)
-        : mVec{x, y, z, w}
-    {
-    }
-    template<typename T2>
-    __hostdev__ explicit Vec4(const Vec4<T2>& v)
-        : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])}
-    {
-    }
-    template<template<class> class Vec4T, class T2>
-    __hostdev__ Vec4(const Vec4T<T2>& v)
-        : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])}
-    {
-        static_assert(Vec4T<T2>::size == size, "expected Vec4T::size==4!");
-    }
-    __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; }
-    __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; }
-    template<template<class> class Vec4T, class T2>
-    __hostdev__ Vec4& operator=(const Vec4T<T2>& rhs)
-    {
-        static_assert(Vec4T<T2>::size == size, "expected Vec4T::size==4!");
-        mVec[0] = rhs[0];
-        mVec[1] = rhs[1];
-        mVec[2] = rhs[2];
-        mVec[3] = rhs[3];
-        return *this;
-    }
-
-    __hostdev__ const T& operator[](int i) const { return mVec[i]; }
-    __hostdev__ T& operator[](int i) { return mVec[i]; }
-    template<typename Vec4T>
-    __hostdev__ T dot(const Vec4T& v) const { 
return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } - __hostdev__ T lengthSqr() const - { - return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] * mVec[3]; // 7 flops - } - __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } - __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } - __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } - __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } - __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } - __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } - __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } - __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec4& operator+=(const Vec4& v) - { - mVec[0] += v[0]; - mVec[1] += v[1]; - mVec[2] += v[2]; - mVec[3] += v[3]; - return *this; - } - __hostdev__ Vec4& operator-=(const Vec4& v) - { - mVec[0] -= v[0]; - mVec[1] -= v[1]; - mVec[2] -= v[2]; - mVec[3] -= v[3]; - return *this; - } - __hostdev__ Vec4& operator*=(const T& s) - { - mVec[0] *= s; - mVec[1] *= s; - mVec[2] *= s; - mVec[3] *= s; - return *this; - } - __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } - __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec4& minComponent(const Vec4& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - if (other[3] < mVec[3]) - mVec[3] = other[3]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Vec4& maxComponent(const Vec4& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - if (other[3] > mVec[3]) - mVec[3] = other[3]; - return *this; - } -}; // Vec4 - -template -__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) -{ - return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); -} -template -__hostdev__ inline Vec4 operator/(T1 scalar, const Vec4& vec) -{ - return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); -} - -using Vec4R = Vec4; -using Vec4d = Vec4; -using Vec4f = Vec4; -using Vec4i = Vec4; - - -// --------------------------> Rgba8 <------------------------------------ - -/// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int -class Rgba8 -{ - union - { - uint8_t c[4]; // 4 integer color channels of red, green, blue and alpha components. 
-        uint32_t packed; // 32 bit packed representation
-    } mData;
-
-public:
-    static const int SIZE = 4;
-    using ValueType = uint8_t;
-
-    /// @brief Default copy constructor
-    Rgba8(const Rgba8&) = default;
-
-    /// @brief Default move constructor
-    Rgba8(Rgba8&&) = default;
-
-    /// @brief Default move assignment operator
-    /// @return non-const reference to this instance
-    Rgba8& operator=(Rgba8&&) = default;
-
-    /// @brief Default copy assignment operator
-    /// @return non-const reference to this instance
-    Rgba8& operator=(const Rgba8&) = default;
-
-    /// @brief Default ctor initializes all channels to zero
-    __hostdev__ Rgba8()
-        : mData{{0, 0, 0, 0}}
-    {
-        static_assert(sizeof(uint32_t) == sizeof(Rgba8), "Unexpected sizeof");
-    }
-
-    /// @brief integer r,g,b,a ctor where alpha channel defaults to opaque
-    /// @note all values should be in the range 0u to 255u
-    __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u)
-        : mData{{r, g, b, a}}
-    {
-    }
-
-    /// @brief ctor where all channels are initialized to the same value
-    /// @note value should be in the range 0u to 255u
-    explicit __hostdev__ Rgba8(uint8_t v)
-        : mData{{v, v, v, v}}
-    {
-    }
-
-    /// @brief floating-point r,g,b,a ctor where alpha channel defaults to opaque
-    /// @note all values should be in the range 0.0f to 1.0f
-    __hostdev__ Rgba8(float r, float g, float b, float a = 1.0f)
-        : mData{{static_cast<uint8_t>(0.5f + r * 255.0f), // round floats to nearest integers
-                 static_cast<uint8_t>(0.5f + g * 255.0f), // double {{}} is needed due to union
-                 static_cast<uint8_t>(0.5f + b * 255.0f),
-                 static_cast<uint8_t>(0.5f + a * 255.0f)}}
-    {
-    }
-
-    /// @brief Vec3f r,g,b ctor (alpha channel is set to 1)
-    /// @note all values should be in the range 0.0f to 1.0f
-    __hostdev__ Rgba8(const Vec3f& rgb)
-        : Rgba8(rgb[0], rgb[1], rgb[2])
-    {
-    }
-
-    /// @brief Vec4f r,g,b,a ctor
-    /// @note all values should be in the range 0.0f to 1.0f
-    __hostdev__ Rgba8(const Vec4f& rgba)
-        : Rgba8(rgba[0], rgba[1], rgba[2], rgba[3])
-    {
-    }
-
-    __hostdev__ bool operator< (const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; }
-    __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; }
-    __hostdev__ float lengthSqr() const
-    {
-        return 0.0000153787005f * (float(mData.c[0]) * mData.c[0] +
-                                   float(mData.c[1]) * mData.c[1] +
-                                   float(mData.c[2]) * mData.c[2]); //1/255^2
-    }
-    __hostdev__ float length() const { return sqrtf(this->lengthSqr()); }
-    /// @brief return n'th color channel as a float in the range 0 to 1
-    __hostdev__ float asFloat(int n) const { return 0.003921569f*float(mData.c[n]); }// divide by 255
-    __hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; }
-    __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; }
-    __hostdev__ const uint32_t& packed() const { return mData.packed; }
-    __hostdev__ uint32_t& packed() { return mData.packed; }
-    __hostdev__ const uint8_t& r() const { return mData.c[0]; }
-    __hostdev__ const uint8_t& g() const { return mData.c[1]; }
-    __hostdev__ const uint8_t& b() const { return mData.c[2]; }
-    __hostdev__ const uint8_t& a() const { return mData.c[3]; }
-    __hostdev__ uint8_t& r() { return mData.c[0]; }
-    __hostdev__ uint8_t& g() { return mData.c[1]; }
-    __hostdev__ uint8_t& b() { return mData.c[2]; }
-    __hostdev__ uint8_t& a() { return mData.c[3]; }
-    __hostdev__ operator Vec3f() const {
-        return Vec3f(this->asFloat(0), this->asFloat(1), this->asFloat(2));
-    }
-    __hostdev__ operator Vec4f() const {
-        return Vec4f(this->asFloat(0), this->asFloat(1), 
this->asFloat(2), this->asFloat(3)); - } -}; // Rgba8 - -using PackedRGBA8 = Rgba8; // for backwards compatibility - -// ----------------------------> TensorTraits <-------------------------------------- - -template::value || is_specialization::value || is_same::value) ? 1 : 0> -struct TensorTraits; - -template -struct TensorTraits -{ - static const int Rank = 0; // i.e. scalar - static const bool IsScalar = true; - static const bool IsVector = false; - static const int Size = 1; - using ElementType = T; - static T scalar(const T& s) { return s; } -}; - -template -struct TensorTraits -{ - static const int Rank = 1; // i.e. vector - static const bool IsScalar = false; - static const bool IsVector = true; - static const int Size = T::SIZE; - using ElementType = typename T::ValueType; - static ElementType scalar(const T& v) { return v.length(); } -}; - -// ----------------------------> FloatTraits <-------------------------------------- - -template::ElementType)> -struct FloatTraits -{ - using FloatType = float; -}; - -template -struct FloatTraits -{ - using FloatType = double; -}; - -template<> -struct FloatTraits -{ - using FloatType = bool; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = uint64_t; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = uint64_t; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = uint64_t; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = uint64_t; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = bool; -}; - -template<> -struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte -{ - using FloatType = double; -}; - -// ----------------------------> mapping BuildType -> GridType <-------------------------------------- - -/// @brief Maps from a templated build type to a GridType enum -template -__hostdev__ inline GridType mapToGridType() -{ - if constexpr(is_same::value) { // resolved at compile-time - return GridType::Float; - } else if constexpr(is_same::value) { - return GridType::Double; - } else if constexpr(is_same::value) { - return GridType::Int16; - } else if constexpr(is_same::value) { - return GridType::Int32; - } else if constexpr(is_same::value) { - return GridType::Int64; - } else if constexpr(is_same::value) { - return GridType::Vec3f; - } else if constexpr(is_same::value) { - return GridType::Vec3d; - } else if constexpr(is_same::value) { - return GridType::UInt32; - } else if constexpr(is_same::value) { - return GridType::Mask; - } else if constexpr(is_same::value) { - return GridType::Half; - } else if constexpr(is_same::value) { - return GridType::Index; - } else if constexpr(is_same::value) { - return GridType::OnIndex; - } else if constexpr(is_same::value) { - return GridType::IndexMask; - } else if constexpr(is_same::value) { - return GridType::OnIndexMask; - } else if constexpr(is_same::value) { - return GridType::Boolean; - } else if constexpr(is_same::value) { - return GridType::RGBA8; - } else if (is_same::value) { - return GridType::Fp4; - } else if constexpr(is_same::value) { - return GridType::Fp8; - } else if constexpr(is_same::value) { - return GridType::Fp16; - } else if constexpr(is_same::value) { - return GridType::FpN; - } else if constexpr(is_same::value) { - return GridType::Vec4f; - } 
else if constexpr(is_same::value) { - return GridType::Vec4d; - } else if (is_same::value) { - return GridType::PointIndex; - } else if constexpr(is_same::value) { - return GridType::Vec3u8; - } else if constexpr(is_same::value) { - return GridType::Vec3u16; - } - return GridType::Unknown; -} - -// ----------------------------> mapping BuildType -> GridClass <-------------------------------------- - -/// @brief Maps from a templated build type to a GridClass enum -template -__hostdev__ inline GridClass mapToGridClass(GridClass defaultClass = GridClass::Unknown) -{ - if (is_same::value) { - return GridClass::Topology; - } else if (BuildTraits::is_index) { - return GridClass::IndexGrid; - } else if (is_same::value) { - return GridClass::VoxelVolume; - } else if (is_same::value) { - return GridClass::PointIndex; - } - return defaultClass; -} - -// ----------------------------> matMult <-------------------------------------- - -/// @brief Multiply a 3x3 matrix and a 3d vector using 32bit floating point arithmetics -/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param xyz input vector to be multiplied by the matrix -/// @return result of matrix-vector multiplication, i.e. mat x xyz -template -__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) -{ - return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), - fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), - fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -/// @brief Multiply a 3x3 matrix and a 3d vector using 64bit floating point arithmetics -/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param xyz input vector to be multiplied by the matrix -/// @return result of matrix-vector multiplication, i.e. mat x xyz -template -__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) -{ - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), - fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), - fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 32bit floating point arithmetics -/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param vec 3d vector to be added AFTER the matrix multiplication -/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec -/// @return result of affine transformation, i.e. 
(mat x xyz) + vec -template -__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) -{ - return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], fmaf(static_cast(xyz[2]), mat[2], vec[0]))), - fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[5], vec[1]))), - fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops -} - -/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 64bit floating point arithmetics -/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param vec 3d vector to be added AFTER the matrix multiplication -/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec -/// @return result of affine transformation, i.e. (mat x xyz) + vec -template -__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) -{ - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), - fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), - fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops -} - -/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 32bit floating point arithmetics -/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param xyz input vector to be multiplied by the transposed matrix -/// @return result of matrix-vector multiplication, i.e. mat^T x xyz -template -__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) -{ - return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), - fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), - fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 64bit floating point arithmetics -/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. -/// @tparam Vec3T Template type of the input and output 3d vectors -/// @param mat pointer to an array of floats with the 3x3 matrix -/// @param xyz input vector to be multiplied by the transposed matrix -/// @return result of matrix-vector multiplication, i.e. 
mat^T x xyz -template -__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) -{ - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), - fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), - fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -template -__hostdev__ inline Vec3T matMultT(const float* mat, const float* vec, const Vec3T& xyz) -{ - return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], fmaf(static_cast(xyz[2]), mat[6], vec[0]))), - fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[7], vec[1]))), - fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops -} - -template -__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) -{ - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), - fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), - fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops -} - -// ----------------------------> BBox <------------------------------------- - -// Base-class for static polymorphism (cannot be constructed directly) -template -struct BaseBBox -{ - Vec3T mCoord[2]; - __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; - __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; - __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } - __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } - __hostdev__ Vec3T& min() { return mCoord[0]; } - __hostdev__ Vec3T& max() { return mCoord[1]; } - __hostdev__ const Vec3T& min() const { return mCoord[0]; } - __hostdev__ const Vec3T& max() const { return mCoord[1]; } - __hostdev__ BaseBBox& translate(const Vec3T& xyz) - { - mCoord[0] += xyz; - mCoord[1] += xyz; - return *this; - } - /// @brief Expand this bounding box to enclose point @c xyz. - __hostdev__ BaseBBox& expand(const Vec3T& xyz) - { - mCoord[0].minComponent(xyz); - mCoord[1].maxComponent(xyz); - return *this; - } - - /// @brief Expand this bounding box to enclose the given bounding box. - __hostdev__ BaseBBox& expand(const BaseBBox& bbox) - { - mCoord[0].minComponent(bbox[0]); - mCoord[1].maxComponent(bbox[1]); - return *this; - } - - /// @brief Intersect this bounding box with the given bounding box. 
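Since expand() takes the component-wise minimum of the two min-corners and the maximum of the two max-corners, while intersect() does the reverse, the two calls compute an enclosing hull and an overlap respectively. A minimal host-side sketch (the corner values are made up for illustration and are not part of the patch):

    nanovdb::BBox<nanovdb::Vec3f> a(nanovdb::Vec3f(0.0f), nanovdb::Vec3f(2.0f));
    nanovdb::BBox<nanovdb::Vec3f> b(nanovdb::Vec3f(1.0f), nanovdb::Vec3f(3.0f));
    a.expand(b);    // a becomes [(0,0,0), (3,3,3)], the hull of a and b
    a.intersect(b); // a becomes [(1,1,1), (3,3,3)], the part of the hull covered by b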
- __hostdev__ BaseBBox& intersect(const BaseBBox& bbox) - { - mCoord[0].maxComponent(bbox[0]); - mCoord[1].minComponent(bbox[1]); - return *this; - } - - //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const - //{ - // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); - //} - __hostdev__ bool isInside(const Vec3T& xyz) - { - if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) - return false; - if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) - return false; - return true; - } - -protected: - __hostdev__ BaseBBox() {} - __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) - : mCoord{min, max} - { - } -}; // BaseBBox - -template::value> -struct BBox; - -/// @brief Partial template specialization for floating point coordinate types. -/// -/// @note Min is inclusive and max is exclusive. If min = max the dimension of -/// the bounding box is zero and therefore it is also empty. -template -struct BBox : public BaseBBox -{ - using Vec3Type = Vec3T; - using ValueType = typename Vec3T::ValueType; - static_assert(is_floating_point::value, "Expected a floating point coordinate type"); - using BaseT = BaseBBox; - using BaseT::mCoord; - /// @brief Default construction sets BBox to an empty bbox - __hostdev__ BBox() - : BaseT(Vec3T( Maximum::value()), - Vec3T(-Maximum::value())) - { - } - __hostdev__ BBox(const Vec3T& min, const Vec3T& max) - : BaseT(min, max) - { - } - __hostdev__ BBox(const Coord& min, const Coord& max) - : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), - Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) - { - } - __hostdev__ static BBox createCube(const Coord& min, typename Coord::ValueType dim) - { - return BBox(min, min.offsetBy(dim)); - } - - __hostdev__ BBox(const BaseBBox& bbox) - : BBox(bbox[0], bbox[1]) - { - } - __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || - mCoord[0][1] >= mCoord[1][1] || - mCoord[0][2] >= mCoord[1][2]; } - __hostdev__ operator bool() const { return mCoord[0][0] < mCoord[1][0] && - mCoord[0][1] < mCoord[1][1] && - mCoord[0][2] < mCoord[1][2]; } - __hostdev__ Vec3T dim() const { return *this ? this->max() - this->min() : Vec3T(0); } - __hostdev__ bool isInside(const Vec3T& p) const - { - return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && - p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; - } - -}; // BBox - -/// @brief Partial template specialization for integer coordinate types -/// -/// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, -/// if min = max the bounding box contains exactly one point and dim = 1! -template -struct BBox : public BaseBBox -{ - static_assert(is_same::value, "Expected \"int\" coordinate type"); - using BaseT = BaseBBox; - using BaseT::mCoord; - /// @brief Iterator over the domain covered by a BBox - /// @details z is the fastest-moving coordinate. 
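The Iterator in the removed block below walks every coordinate of the (inclusive) integer box in lexicographic order with z moving fastest, as the @details above states. A usage sketch with made-up bounds:

    nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(1)); // the 2x2x2 box [(0,0,0), (1,1,1)]
    for (auto it = bbox.begin(); it; ++it) {
        const nanovdb::Coord& ijk = *it; // visits (0,0,0), (0,0,1), (0,1,0), (0,1,1), (1,0,0), ...
    }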
-    class Iterator
-    {
-        const BBox& mBBox;
-        CoordT      mPos;
-
-    public:
-        __hostdev__ Iterator(const BBox& b)
-            : mBBox(b)
-            , mPos(b.min())
-        {
-        }
-        __hostdev__ Iterator(const BBox& b, const Coord& p)
-            : mBBox(b)
-            , mPos(p)
-        {
-        }
-        __hostdev__ Iterator& operator++()
-        {
-            if (mPos[2] < mBBox[1][2]) { // this is the most common case
-                ++mPos[2];// increment z
-            } else if (mPos[1] < mBBox[1][1]) {
-                mPos[2] = mBBox[0][2];// reset z
-                ++mPos[1];// increment y
-            } else if (mPos[0] <= mBBox[1][0]) {
-                mPos[2] = mBBox[0][2];// reset z
-                mPos[1] = mBBox[0][1];// reset y
-                ++mPos[0];// increment x
-            }
-            return *this;
-        }
-        __hostdev__ Iterator operator++(int)
-        {
-            auto tmp = *this;
-            ++(*this);
-            return tmp;
-        }
-        __hostdev__ bool operator==(const Iterator& rhs) const
-        {
-            NANOVDB_ASSERT(mBBox == rhs.mBBox);
-            return mPos == rhs.mPos;
-        }
-        __hostdev__ bool operator!=(const Iterator& rhs) const
-        {
-            NANOVDB_ASSERT(mBBox == rhs.mBBox);
-            return mPos != rhs.mPos;
-        }
-        __hostdev__ bool operator<(const Iterator& rhs) const
-        {
-            NANOVDB_ASSERT(mBBox == rhs.mBBox);
-            return mPos < rhs.mPos;
-        }
-        __hostdev__ bool operator<=(const Iterator& rhs) const
-        {
-            NANOVDB_ASSERT(mBBox == rhs.mBBox);
-            return mPos <= rhs.mPos;
-        }
-        /// @brief Return @c true if the iterator still points to a valid coordinate.
-        __hostdev__ operator bool() const { return mPos <= mBBox[1]; }
-        __hostdev__ const CoordT& operator*() const { return mPos; }
-    }; // Iterator
-    __hostdev__ Iterator begin() const { return Iterator{*this}; }
-    __hostdev__ Iterator end() const { return Iterator{*this, CoordT(mCoord[1][0]+1, mCoord[0][1], mCoord[0][2])}; }
-    __hostdev__ BBox()
-        : BaseT(CoordT::max(), CoordT::min())
-    {
-    }
-    __hostdev__ BBox(const CoordT& min, const CoordT& max)
-        : BaseT(min, max)
-    {
-    }
+    /// instance has a smaller major version (is older), and a positive age if it is newer, i.e. larger.
+    __hostdev__ int age() const {return int(this->getMajor()) - int(NANOVDB_MAJOR_VERSION_NUMBER);}
+}; // Version
-    template<typename SplitT>
-    __hostdev__ BBox(BBox& other, const SplitT&)
-        : BaseT(other.mCoord[0], other.mCoord[1])
-    {
-        NANOVDB_ASSERT(this->is_divisible());
-        const int n = MaxIndex(this->dim());
-        mCoord[1][n] = (mCoord[0][n] + mCoord[1][n]) >> 1;
-        other.mCoord[0][n] = mCoord[1][n] + 1;
-    }
+/// @brief print the version number to a c-string
+/// @param dst destination string of size 8 or more
+/// @param v version to be printed
+/// @return the destination string @c dst
+__hostdev__ inline char* toStr(char *dst, const Version &v)
+{
+    return util::sprint(dst, v.getMajor(), ".",v.getMinor(), ".",v.getPatch());
+}
-    __hostdev__ static BBox createCube(const CoordT& min, typename CoordT::ValueType dim)
-    {
-        return BBox(min, min.offsetBy(dim - 1));
-    }
+// ----------------------------> TensorTraits <--------------------------------------
-    __hostdev__ static BBox createCube(typename CoordT::ValueType min, typename CoordT::ValueType max)
-    {
-        return BBox(CoordT(min), CoordT(max));
-    }
+template<typename T, int Rank = (util::is_specialization<T, math::Vec3>::value || util::is_specialization<T, math::Vec4>::value || util::is_same<T, math::Rgba8>::value) ? 1 : 0>
+struct TensorTraits;
-    __hostdev__ bool is_divisible() const { return mCoord[0][0] < mCoord[1][0] &&
-                                                   mCoord[0][1] < mCoord[1][1] &&
-                                                   mCoord[0][2] < mCoord[1][2]; }
-    /// @brief Return true if this bounding box is empty, e.g.
uninitialized - __hostdev__ bool empty() const { return mCoord[0][0] > mCoord[1][0] || - mCoord[0][1] > mCoord[1][1] || - mCoord[0][2] > mCoord[1][2]; } - /// @brief Convert this BBox to boolean true if it is not empty - __hostdev__ operator bool() const { return mCoord[0][0] <= mCoord[1][0] && - mCoord[0][1] <= mCoord[1][1] && - mCoord[0][2] <= mCoord[1][2]; } - __hostdev__ CoordT dim() const { return *this ? this->max() - this->min() + Coord(1) : Coord(0); } - __hostdev__ uint64_t volume() const - { - auto d = this->dim(); - return uint64_t(d[0]) * uint64_t(d[1]) * uint64_t(d[2]); - } - __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } - /// @brief Return @c true if the given bounding box is inside this bounding box. - __hostdev__ bool isInside(const BBox& b) const - { - return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); - } +template +struct TensorTraits +{ + static const int Rank = 0; // i.e. scalar + static const bool IsScalar = true; + static const bool IsVector = false; + static const int Size = 1; + using ElementType = T; + static T scalar(const T& s) { return s; } +}; - /// @brief Return @c true if the given bounding box overlaps with this bounding box. - __hostdev__ bool hasOverlap(const BBox& b) const - { - return !(CoordT::lessThan(this->max(), b.min()) || CoordT::lessThan(b.max(), this->min())); - } +template +struct TensorTraits +{ + static const int Rank = 1; // i.e. vector + static const bool IsScalar = false; + static const bool IsVector = true; + static const int Size = T::SIZE; + using ElementType = typename T::ValueType; + static ElementType scalar(const T& v) { return v.length(); } +}; - /// @warning This converts a CoordBBox into a floating-point bounding box which implies that max += 1 ! - template - __hostdev__ BBox> asReal() const - { - static_assert(is_floating_point::value, "CoordBBox::asReal: Expected a floating point coordinate"); - return BBox>(Vec3(RealT(mCoord[0][0]), RealT(mCoord[0][1]), RealT(mCoord[0][2])), - Vec3(RealT(mCoord[1][0] + 1), RealT(mCoord[1][1] + 1), RealT(mCoord[1][2] + 1))); - } - /// @brief Return a new instance that is expanded by the specified padding. 
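Because both corners of an integer box are inclusive, dim() adds one to the corner difference, and a box whose corners coincide still holds exactly one voxel. A quick sketch with made-up corners:

    nanovdb::CoordBBox box(nanovdb::Coord(0), nanovdb::Coord(7));
    // box.dim()    returns Coord(8, 8, 8), since dim = max - min + 1
    // box.volume() returns 512, i.e. 8 * 8 * 8
    // box.isInside(nanovdb::Coord(7)) is true, since max is included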
- __hostdev__ BBox expandBy(typename CoordT::ValueType padding) const - { - return BBox(mCoord[0].offsetBy(-padding), mCoord[1].offsetBy(padding)); - } +// ----------------------------> FloatTraits <-------------------------------------- - /// @brief @brief transform this coordinate bounding box by the specified map - /// @param map mapping of index to world coordinates - /// @return world bounding box - template - __hostdev__ BBox transform(const Map& map) const - { - const Vec3d tmp = map.applyMap(Vec3d(mCoord[0][0], mCoord[0][1], mCoord[0][2])); - BBox bbox(tmp, tmp); - bbox.expand(map.applyMap(Vec3d(mCoord[0][0], mCoord[0][1], mCoord[1][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[0][0], mCoord[1][1], mCoord[0][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[1][0], mCoord[0][1], mCoord[0][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[1][0], mCoord[1][1], mCoord[0][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[1][0], mCoord[0][1], mCoord[1][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[0][0], mCoord[1][1], mCoord[1][2]))); - bbox.expand(map.applyMap(Vec3d(mCoord[1][0], mCoord[1][1], mCoord[1][2]))); - return bbox; - } +template::ElementType)> +struct FloatTraits +{ + using FloatType = float; +}; -#if defined(__CUDACC__) // the following functions only run on the GPU! - __device__ inline BBox& expandAtomic(const CoordT& ijk) - { - mCoord[0].minComponentAtomic(ijk); - mCoord[1].maxComponentAtomic(ijk); - return *this; - } - __device__ inline BBox& expandAtomic(const BBox& bbox) - { - mCoord[0].minComponentAtomic(bbox[0]); - mCoord[1].maxComponentAtomic(bbox[1]); - return *this; - } - __device__ inline BBox& intersectAtomic(const BBox& bbox) - { - mCoord[0].maxComponentAtomic(bbox[0]); - mCoord[1].minComponentAtomic(bbox[1]); - return *this; - } -#endif -}; // BBox +template +struct FloatTraits +{ + using FloatType = double; +}; -using CoordBBox = BBox; -using BBoxR = BBox; +template<> +struct FloatTraits +{ + using FloatType = bool; +}; -// -------------------> Find lowest and highest bit in a word <---------------------------- +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; -/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 32 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! 
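The removed FindLowestOn (re-homed as util::findLowestOn by this patch) falls back to the classic De Bruijn technique when no intrinsics are available: v & -v (two's-complement negation) isolates the lowest set bit, and multiplying that power of two by a De Bruijn constant yields a unique 5-bit table index. A small sketch of the isolation step, with a made-up input:

    uint32_t v  = 0xC0u;         // bits 6 and 7 are set
    uint32_t lo = v & (~v + 1u); // 0x40u: only the lowest set bit survives
    // FindLowestOn(v) therefore returns 6; the De Bruijn table below maps 0x40u to that index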
-NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindLowestOn(uint32_t v) +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte { - NANOVDB_ASSERT(v); -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - return __ffs(v) - 1; // one based indexing -#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanForward(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return static_cast(__builtin_ctzl(v)); -#else - //NANO_WARNING("Using software implementation for FindLowestOn(uint32_t v)") - static const unsigned char DeBruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; -// disable unary minus on unsigned warning -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(pop) -#endif + using FloatType = uint64_t; +}; -#endif -} +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; -/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindHighestOn(uint32_t v) +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte { - NANOVDB_ASSERT(v); -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(uint32_t) * 8 - 1 - __clz(v); // Return the number of consecutive high-order zero bits in a 32-bit integer. -#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanReverse(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v); -#else - //NANO_WARNING("Using software implementation for FindHighestOn(uint32_t)") - static const unsigned char DeBruijn[32] = { - 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, - 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; - v |= v >> 1; // first round down to one less than a power of 2 - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijn[uint32_t(v * 0x07C4ACDDU) >> 27]; -#endif -} + using FloatType = uint64_t; +}; -/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 64 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! 
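In the portable branch of FindHighestOn above, the shift-or cascade smears the highest set bit into every lower position, i.e. it rounds v up to 2^(k+1) - 1, which the De Bruijn multiply then maps to the index k. A sketch with a made-up value:

    uint32_t v = 0x00100000u; // only bit 20 is set
    v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
    // v is now 0x001FFFFFu == 2^21 - 1, and FindHighestOn(0x00100000u) returns 20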
-NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindLowestOn(uint64_t v) +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte { - NANOVDB_ASSERT(v); -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - return __ffsll(static_cast(v)) - 1; // one based indexing -#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanForward64(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return static_cast(__builtin_ctzll(v)); -#else - //NANO_WARNING("Using software implementation for FindLowestOn(uint64_t)") - static const unsigned char DeBruijn[64] = { - 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, - 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, - 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, - 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12, - }; -// disable unary minus on unsigned warning -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - return DeBruijn[uint64_t((v & -v) * UINT64_C(0x022FDD63CC95386D)) >> 58]; -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(pop) -#endif + using FloatType = bool; +}; -#endif -} +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = double; +}; -/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 64 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindHighestOn(uint64_t v) +// ----------------------------> mapping BuildType -> GridType <-------------------------------------- + +/// @brief Maps from a templated build type to a GridType enum +template +__hostdev__ inline GridType toGridType() { - NANOVDB_ASSERT(v); -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(unsigned long) * 8 - 1 - __clzll(static_cast(v)); -#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanReverse64(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v); -#else - const uint32_t* p = reinterpret_cast(&v); - return p[1] ? 
32u + FindHighestOn(p[1]) : FindHighestOn(p[0]); -#endif -} + if constexpr(util::is_same::value) { // resolved at compile-time + return GridType::Float; + } else if constexpr(util::is_same::value) { + return GridType::Double; + } else if constexpr(util::is_same::value) { + return GridType::Int16; + } else if constexpr(util::is_same::value) { + return GridType::Int32; + } else if constexpr(util::is_same::value) { + return GridType::Int64; + } else if constexpr(util::is_same::value) { + return GridType::Vec3f; + } else if constexpr(util::is_same::value) { + return GridType::Vec3d; + } else if constexpr(util::is_same::value) { + return GridType::UInt32; + } else if constexpr(util::is_same::value) { + return GridType::Mask; + } else if constexpr(util::is_same::value) { + return GridType::Half; + } else if constexpr(util::is_same::value) { + return GridType::Index; + } else if constexpr(util::is_same::value) { + return GridType::OnIndex; + } else if constexpr(util::is_same::value) { + return GridType::IndexMask; + } else if constexpr(util::is_same::value) { + return GridType::OnIndexMask; + } else if constexpr(util::is_same::value) { + return GridType::Boolean; + } else if constexpr(util::is_same::value) { + return GridType::RGBA8; + } else if constexpr(util::is_same::value) { + return GridType::Fp4; + } else if constexpr(util::is_same::value) { + return GridType::Fp8; + } else if constexpr(util::is_same::value) { + return GridType::Fp16; + } else if constexpr(util::is_same::value) { + return GridType::FpN; + } else if constexpr(util::is_same::value) { + return GridType::Vec4f; + } else if constexpr(util::is_same::value) { + return GridType::Vec4d; + } else if constexpr(util::is_same::value) { + return GridType::PointIndex; + } else if constexpr(util::is_same::value) { + return GridType::Vec3u8; + } else if constexpr(util::is_same::value) { + return GridType::Vec3u16; + } else if constexpr(util::is_same::value) { + return GridType::UInt8; + } + return GridType::Unknown; +}// toGridType -// ----------------------------> CountOn <-------------------------------------- +template +[[deprecated("Use toGridType() instead.")]] +__hostdev__ inline GridType mapToGridType(){return toGridType();} + +// ----------------------------> mapping BuildType -> GridClass <-------------------------------------- -/// @return Number of bits that are on in the specified 64-bit word -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ inline uint32_t CountOn(uint64_t v) +/// @brief Maps from a templated build type to a GridClass enum +template +__hostdev__ inline GridClass toGridClass(GridClass defaultClass = GridClass::Unknown) { -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - //#warning Using popcll for CountOn - return __popcll(v); -// __popcnt64 intrinsic support was added in VS 2019 16.8 -#elif defined(_MSC_VER) && defined(_M_X64) && (_MSC_VER >= 1928) && defined(NANOVDB_USE_INTRINSICS) - //#warning Using popcnt64 for CountOn - return uint32_t(__popcnt64(v)); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - //#warning Using builtin_popcountll for CountOn - return __builtin_popcountll(v); -#else // use software implementation - //NANO_WARNING("Using software implementation for CountOn") - v = v - ((v >> 1) & uint64_t(0x5555555555555555)); - v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); - return (((v + (v >> 4)) & uint64_t(0xF0F0F0F0F0F0F0F)) * uint64_t(0x101010101010101)) >> 56; -#endif + if 
constexpr(util::is_same::value) { + return GridClass::Topology; + } else if constexpr(BuildTraits::is_index) { + return GridClass::IndexGrid; + } else if constexpr(util::is_same::value) { + return GridClass::VoxelVolume; + } else if constexpr(util::is_same::value) { + return GridClass::PointIndex; + } + return defaultClass; +} + +template +[[deprecated("Use toGridClass() instead.")]] +__hostdev__ inline GridClass mapToGridClass(GridClass defaultClass = GridClass::Unknown) +{ + return toGridClass(); } // ----------------------------> BitFlags <-------------------------------------- @@ -2694,31 +930,28 @@ class BitFlags : public BitArray public: using Type = decltype(mFlags); BitFlags() {} + BitFlags(Type mask) : BitArray{mask} {} BitFlags(std::initializer_list list) { - for (auto bit : list) - mFlags |= static_cast(1 << bit); + for (auto bit : list) mFlags |= static_cast(1 << bit); } template BitFlags(std::initializer_list list) { - for (auto mask : list) - mFlags |= static_cast(mask); + for (auto mask : list) mFlags |= static_cast(mask); } __hostdev__ Type data() const { return mFlags; } __hostdev__ Type& data() { return mFlags; } __hostdev__ void initBit(std::initializer_list list) { mFlags = 0u; - for (auto bit : list) - mFlags |= static_cast(1 << bit); + for (auto bit : list) mFlags |= static_cast(1 << bit); } template __hostdev__ void initMask(std::initializer_list list) { mFlags = 0u; - for (auto mask : list) - mFlags |= static_cast(mask); + for (auto mask : list) mFlags |= static_cast(mask); } //__hostdev__ Type& data() { return mFlags; } //__hostdev__ Type data() const { return mFlags; } @@ -2732,13 +965,11 @@ class BitFlags : public BitArray __hostdev__ void setBitOn(std::initializer_list list) { - for (auto bit : list) - mFlags |= static_cast(1 << bit); + for (auto bit : list) mFlags |= static_cast(1 << bit); } __hostdev__ void setBitOff(std::initializer_list list) { - for (auto bit : list) - mFlags &= ~static_cast(1 << bit); + for (auto bit : list) mFlags &= ~static_cast(1 << bit); } template @@ -2749,14 +980,12 @@ class BitFlags : public BitArray template __hostdev__ void setMaskOn(std::initializer_list list) { - for (auto mask : list) - mFlags |= static_cast(mask); + for (auto mask : list) mFlags |= static_cast(mask); } template __hostdev__ void setMaskOff(std::initializer_list list) { - for (auto mask : list) - mFlags &= ~static_cast(mask); + for (auto mask : list) mFlags &= ~static_cast(mask); } __hostdev__ void setBit(uint8_t bit, bool on) { on ? 
this->setBitOn(bit) : this->setBitOff(bit); } @@ -2775,18 +1004,18 @@ class BitFlags : public BitArray template __hostdev__ bool isMaskOn(std::initializer_list list) const { - for (auto mask : list) - if (0 != (mFlags & static_cast(mask))) - return true; + for (auto mask : list) { + if (0 != (mFlags & static_cast(mask))) return true; + } return false; } /// @brief return true if any of the masks in the list are off template __hostdev__ bool isMaskOff(std::initializer_list list) const { - for (auto mask : list) - if (0 == (mFlags & static_cast(mask))) - return true; + for (auto mask : list) { + if (0 == (mFlags & static_cast(mask))) return true; + } return false; } /// @brief required for backwards compatibility @@ -2822,16 +1051,16 @@ class Mask { uint32_t sum = 0; for (const uint64_t *w = mWords, *q = w + WORD_COUNT; w != q; ++w) - sum += CountOn(*w); + sum += util::countOn(*w); return sum; } /// @brief Return the number of lower set bits in mask up to but excluding the i'th bit inline __hostdev__ uint32_t countOn(uint32_t i) const { - uint32_t n = i >> 6, sum = CountOn(mWords[n] & ((uint64_t(1) << (i & 63u)) - 1u)); + uint32_t n = i >> 6, sum = util::countOn(mWords[n] & ((uint64_t(1) << (i & 63u)) - 1u)); for (const uint64_t* w = mWords; n--; ++w) - sum += CountOn(*w); + sum += util::countOn(*w); return sum; } @@ -2932,7 +1161,7 @@ class Mask /// @brief Assignment operator that works with openvdb::util::NodeMask template - __hostdev__ typename enable_if::value, Mask&>::type operator=(const MaskT& other) + __hostdev__ typename util::enable_if::value, Mask&>::type operator=(const MaskT& other) { static_assert(sizeof(Mask) == sizeof(MaskT), "Mismatching sizeof"); static_assert(WORD_COUNT == MaskT::WORD_COUNT, "Mismatching word count"); @@ -2943,11 +1172,8 @@ class Mask return *this; } - __hostdev__ Mask& operator=(const Mask& other) - { - memcpy64(mWords, other.mWords, WORD_COUNT); - return *this; - } + //__hostdev__ Mask& operator=(const Mask& other){return *util::memcpy(this, &other);} + Mask& operator=(const Mask&) = default; __hostdev__ bool operator==(const Mask& other) const { @@ -3019,30 +1245,26 @@ class Mask /// @brief Set all bits on __hostdev__ void setOn() { - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = ~uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i)mWords[i] = ~uint64_t(0); } /// @brief Set all bits off __hostdev__ void setOff() { - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = uint64_t(0); } /// @brief Set all bits off __hostdev__ void set(bool on) { const uint64_t v = on ? 
~uint64_t(0) : uint64_t(0); - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = v; + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = v; } /// brief Toggle the state of all bits in the mask __hostdev__ void toggle() { uint32_t n = WORD_COUNT; - for (auto* w = mWords; n--; ++w) - *w = ~*w; + for (auto* w = mWords; n--; ++w) *w = ~*w; } __hostdev__ void toggle(uint32_t n) { mWords[n >> 6] ^= uint64_t(1) << (n & 63); } @@ -3051,8 +1273,7 @@ class Mask { uint64_t* w1 = mWords; const uint64_t* w2 = other.mWords; - for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) - *w1 &= *w2; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= *w2; return *this; } /// @brief Bitwise union @@ -3060,8 +1281,7 @@ class Mask { uint64_t* w1 = mWords; const uint64_t* w2 = other.mWords; - for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) - *w1 |= *w2; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 |= *w2; return *this; } /// @brief Bitwise difference @@ -3069,8 +1289,7 @@ class Mask { uint64_t* w1 = mWords; const uint64_t* w2 = other.mWords; - for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) - *w1 &= ~*w2; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= ~*w2; return *this; } /// @brief Bitwise XOR @@ -3078,8 +1297,7 @@ class Mask { uint64_t* w1 = mWords; const uint64_t* w2 = other.mWords; - for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) - *w1 ^= *w2; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 ^= *w2; return *this; } @@ -3089,9 +1307,8 @@ class Mask { uint32_t n = 0u; const uint64_t* w = mWords; - for (; n < WORD_COUNT && !(ON ? *w : ~*w); ++w, ++n) - ; - return n < WORD_COUNT ? (n << 6) + FindLowestOn(ON ? *w : ~*w) : SIZE; + for (; n < WORD_COUNT && !(ON ? *w : ~*w); ++w, ++n); + return n < WORD_COUNT ? (n << 6) + util::findLowestOn(ON ? *w : ~*w) : SIZE; } NANOVDB_HOSTDEV_DISABLE_WARNING @@ -3099,16 +1316,13 @@ class Mask __hostdev__ uint32_t findNext(uint32_t start) const { uint32_t n = start >> 6; // initiate - if (n >= WORD_COUNT) - return SIZE; // check for out of bounds + if (n >= WORD_COUNT) return SIZE; // check for out of bounds uint32_t m = start & 63u; uint64_t b = ON ? mWords[n] : ~mWords[n]; - if (b & (uint64_t(1u) << m)) - return start; // simple case: start is on/off + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off b &= ~uint64_t(0u) << m; // mask out lower bits - while (!b && ++n < WORD_COUNT) - b = ON ? mWords[n] : ~mWords[n]; // find next non-zero word - return b ? (n << 6) + FindLowestOn(b) : SIZE; // catch last word=0 + while (!b && ++n < WORD_COUNT) b = ON ? mWords[n] : ~mWords[n]; // find next non-zero word + return b ? (n << 6) + util::findLowestOn(b) : SIZE; // catch last word=0 } NANOVDB_HOSTDEV_DISABLE_WARNING @@ -3116,16 +1330,13 @@ class Mask __hostdev__ uint32_t findPrev(uint32_t start) const { uint32_t n = start >> 6; // initiate - if (n >= WORD_COUNT) - return SIZE; // check for out of bounds + if (n >= WORD_COUNT) return SIZE; // check for out of bounds uint32_t m = start & 63u; uint64_t b = ON ? mWords[n] : ~mWords[n]; - if (b & (uint64_t(1u) << m)) - return start; // simple case: start is on/off + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off b &= (uint64_t(1u) << m) - 1u; // mask out higher bits - while (!b && n) - b = ON ? mWords[--n] : ~mWords[--n]; // find previous non-zero word - return b ? (n << 6) + FindHighestOn(b) : SIZE; // catch first word=0 + while (!b && n) b = ON ? mWords[--n] : ~mWords[--n]; // find previous non-zero word + return b ? 
(n << 6) + util::findHighestOn(b) : SIZE; // catch first word=0 } private: @@ -3148,11 +1359,11 @@ struct Map /// @brief Default constructor for the identity map __hostdev__ Map() - : mMatF{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} + : mMatF{ 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} , mInvMatF{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} , mVecF{0.0f, 0.0f, 0.0f} , mTaperF{1.0f} - , mMatD{1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} + , mMatD{ 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} , mInvMatD{1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} , mVecD{0.0, 0.0, 0.0} , mTaperD{1.0} @@ -3190,7 +1401,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation template - __hostdev__ Vec3T applyMap(const Vec3T& ijk) const { return matMult(mMatD, mVecD, ijk); } + __hostdev__ Vec3T applyMap(const Vec3T& ijk) const { return math::matMult(mMatD, mVecD, ijk); } /// @brief Apply the forward affine transformation to a vector using 32bit floating point arithmetics. /// @note Typically this operation is used for the scale, rotation and translation of index -> world mapping @@ -3198,7 +1409,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation template - __hostdev__ Vec3T applyMapF(const Vec3T& ijk) const { return matMult(mMatF, mVecF, ijk); } + __hostdev__ Vec3T applyMapF(const Vec3T& ijk) const { return math::matMult(mMatF, mVecF, ijk); } /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, /// e.g. scale and rotation WITHOUT translation. @@ -3207,7 +1418,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return linear forward 3x3 mapping of the input vector template - __hostdev__ Vec3T applyJacobian(const Vec3T& ijk) const { return matMult(mMatD, ijk); } + __hostdev__ Vec3T applyJacobian(const Vec3T& ijk) const { return math::matMult(mMatD, ijk); } /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, /// e.g. scale and rotation WITHOUT translation. @@ -3216,7 +1427,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return linear forward 3x3 mapping of the input vector template - __hostdev__ Vec3T applyJacobianF(const Vec3T& ijk) const { return matMult(mMatF, ijk); } + __hostdev__ Vec3T applyJacobianF(const Vec3T& ijk) const { return math::matMult(mMatF, ijk); } /// @brief Apply the inverse affine mapping to a vector using 64bit floating point arithmetics. /// @note Typically this operation is used for the world -> index mapping @@ -3226,7 +1437,7 @@ struct Map template __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const { - return matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); + return math::matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); } /// @brief Apply the inverse affine mapping to a vector using 32bit floating point arithmetics. 
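Taken together, Map::applyMap and Map::applyInverseMap implement the index-to-world transform xyz = mat x ijk + vec and its inverse ijk = invMat x (xyz - vec), so they round-trip exactly for any invertible map. A host-side sketch using the default identity map and a made-up point:

    nanovdb::Map map; // identity transform per the default constructor
    nanovdb::Vec3d ijk(1.0, 2.0, 3.0);              // index-space point (made up)
    nanovdb::Vec3d xyz  = map.applyMap(ijk);        // (mat x ijk) + vec
    nanovdb::Vec3d back = map.applyInverseMap(xyz); // invMat x (xyz - vec)
    // back == ijk; for the identity map xyz also equals (1,2,3)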
@@ -3237,7 +1448,7 @@ struct Map template __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const { - return matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); + return math::matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); } /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, @@ -3247,7 +1458,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 template - __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return matMult(mInvMatD, xyz); } + __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return math::matMult(mInvMatD, xyz); } /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, /// e.g. inverse scale and inverse rotation WITHOUT translation. @@ -3256,7 +1467,7 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 template - __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return matMult(mInvMatF, xyz); } + __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return math::matMult(mInvMatF, xyz); } /// @brief Apply the transposed inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, /// e.g. inverse scale and inverse rotation WITHOUT translation. @@ -3265,9 +1476,9 @@ struct Map /// @param ijk 3D vector to be mapped - typically floating point index coordinates /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 template - __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return matMultT(mInvMatD, xyz); } + __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return math::matMultT(mInvMatD, xyz); } template - __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return matMultT(mInvMatF, xyz); } + __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return math::matMultT(mInvMatF, xyz); } /// @brief Return a voxels size in each coordinate direction, measured at the origin __hostdev__ Vec3d getVoxelSize() const { return this->applyMap(Vec3d(1)) - this->applyMap(Vec3d(0)); } @@ -3326,20 +1537,20 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData // disallow copy-assignment since methods like blindData and getBlindData uses the this pointer! const GridBlindMetaData& operator=(const GridBlindMetaData&) = delete; - __hostdev__ void setBlindData(void* blindData) { mDataOffset = PtrDiff(blindData, this); } + __hostdev__ void setBlindData(void* blindData) { mDataOffset = util::PtrDiff(blindData, this); } // unsafe - __hostdev__ const void* blindData() const {return PtrAdd(this, mDataOffset);} + __hostdev__ const void* blindData() const {return util::PtrAdd(this, mDataOffset);} /// @brief Get a const pointer to the blind data represented by this meta data /// @tparam BlindDataT Expected value type of the blind data. - /// @return Returns NULL if mGridType!=mapToGridType(), else a const point of type BlindDataT. + /// @return Returns NULL if mGridType!=toGridType(), else a const point of type BlindDataT. /// @note Use mDataType=Unknown if BlindDataT is a custom data type unknown to NanoVDB. 
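Since getBlindData() returns NULL on a type-tag mismatch, the template argument doubles as a runtime type check. A usage sketch, where meta stands for a hypothetical const GridBlindMetaData* fetched via the grid's blindMetaData(n):

    if (const nanovdb::Vec3f* data = meta->getBlindData<nanovdb::Vec3f>()) {
        // mDataType matched GridType::Vec3f; meta->mValueCount entries are readable
    } else {
        // type mismatch, or the data was stored with mDataType == GridType::Unknown
    }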
 template<typename BlindDataT>
 __hostdev__ const BlindDataT* getBlindData() const
 {
-        //if (mDataType != mapToGridType<BlindDataT>()) printf("getBlindData mismatch\n");
-        return mDataType == mapToGridType<BlindDataT>() ? PtrAdd<BlindDataT>(this, mDataOffset) : nullptr;
+        //if (mDataType != toGridType<BlindDataT>()) printf("getBlindData mismatch\n");
+        return mDataType == toGridType<BlindDataT>() ? util::PtrAdd<BlindDataT>(this, mDataOffset) : nullptr;
     }
 
     /// @brief return true if this meta data has a valid combination of semantic, class and value tags
@@ -3373,7 +1584,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData
     ///        of blind data is mValueCount * mValueSize
     __hostdev__ uint64_t blindDataSize() const
     {
-        return AlignUp<NANOVDB_DATA_ALIGNMENT>(mValueCount * mValueSize);
+        return math::AlignUp<NANOVDB_DATA_ALIGNMENT>(mValueCount * mValueSize);
     }
 }; // GridBlindMetaData
 
@@ -3463,6 +1674,113 @@ struct ProbeValue;
 template<typename BuildT>
 struct GetNodeInfo;
 
+// ----------------------------> CheckMode <----------------------------------
+
+/// @brief List of different modes for computing a checksum
+enum class CheckMode : uint32_t { Disable = 0, // no computation
+                                  Empty = 0,
+                                  Half = 1,
+                                  Partial = 1, // fast but approximate
+                                  Default = 1, // defaults to Partial
+                                  Full = 2, // slow but accurate
+                                  End = 3, // marks the end of the enum list
+                                  StrLen = 9 + End};
+
+/// @brief Prints CheckMode enum to a c-string
+/// @param dst Destination c-string
+/// @param mode CheckMode enum to be converted to string
+/// @return destination string @c dst
+__hostdev__ inline char* toStr(char *dst, CheckMode mode)
+{
+    switch (mode){
+        case CheckMode::Half: return util::strcpy(dst, "half");
+        case CheckMode::Full: return util::strcpy(dst, "full");
+        default: return util::strcpy(dst, "disabled");
+    }
+}
+
+// ----------------------------> Checksum <----------------------------------
+
+/// @brief Class that encapsulates two CRC32 checksums, one for the Grid, Tree and Root node meta data
+///        and one for the remaining grid nodes.
+class Checksum
+{
+    /// Three types of checksums:
+    ///   1) Empty: all 64 bits are on (used to signify a disabled or undefined checksum)
+    ///   2) Half: Upper 32 bits are on and not all of lower 32 bits are on (lower 32 bits checksum head of grid)
+    ///   3) Full: Not all of the 64 bits are one (lower 32 bits checksum head of grid and upper 32 bits checksum tail of grid)
+    union { uint32_t mCRC32[2]; uint64_t mCRC64; };// mCRC32[0] is checksum of Grid, Tree and Root, and mCRC32[1] is checksum of nodes
+
+public:
+
+    static constexpr uint32_t EMPTY32 = ~uint32_t{0};
+    static constexpr uint64_t EMPTY64 = ~uint64_t(0);
+
+    /// @brief default constructor initiates checksum to EMPTY
+    __hostdev__ Checksum() : mCRC64{EMPTY64} {}
+
+    /// @brief Constructor that allows the two 32bit checksums to be initiated explicitly
+    /// @param head Initial 32bit CRC checksum of grid, tree and root data
+    /// @param tail Initial 32bit CRC checksum of all the nodes and blind data
+    __hostdev__ Checksum(uint32_t head, uint32_t tail) : mCRC32{head, tail} {}
+
+    /// @brief Constructor that initiates the checksum from a single 64bit value
+    /// @param checksum 64bit checksum, i.e. two packed 32bit CRC checksums
+    /// @param mode CheckMode of the checksum (Disable, Half/Partial or Full)
+    __hostdev__ Checksum(uint64_t checksum, CheckMode mode = CheckMode::Full) : mCRC64{mode == CheckMode::Disable ? EMPTY64 : checksum}
+    {
+        if (mode == CheckMode::Partial) mCRC32[1] = EMPTY32;
+    }
+
+    /// @brief return the 64 bit checksum of this instance
+    [[deprecated("Use Checksum::data instead.")]]
+    __hostdev__ uint64_t checksum() const { return mCRC64; }
+    [[deprecated("Use Checksum::head and Checksum::tail instead.")]]
+    __hostdev__ uint32_t& checksum(int i) {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; }
+    [[deprecated("Use Checksum::head and Checksum::tail instead.")]]
+    __hostdev__ uint32_t checksum(int i) const {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; }
+
+    __hostdev__ uint64_t full() const { return mCRC64; }
+    __hostdev__ uint64_t& full() { return mCRC64; }
+    __hostdev__ uint32_t head() const { return mCRC32[0]; }
+    __hostdev__ uint32_t& head() { return mCRC32[0]; }
+    __hostdev__ uint32_t tail() const { return mCRC32[1]; }
+    __hostdev__ uint32_t& tail() { return mCRC32[1]; }
+
+    /// @brief return true if the 64 bit checksum is partial, i.e. of head only
+    [[deprecated("Use Checksum::isHalf instead.")]]
+    __hostdev__ bool isPartial() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; }
+    __hostdev__ bool isHalf() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; }
+
+    /// @brief return true if the 64 bit checksum is full, i.e. of both head and nodes
+    __hostdev__ bool isFull() const { return mCRC64 != EMPTY64 && mCRC32[1] != EMPTY32; }
+
+    /// @brief return true if the 64 bit checksum is disabled (unset)
+    __hostdev__ bool isEmpty() const { return mCRC64 == EMPTY64; }
+
+    __hostdev__ void disable() { mCRC64 = EMPTY64; }
+
+    /// @brief return the mode of the 64 bit checksum
+    __hostdev__ CheckMode mode() const
+    {
+        return mCRC64 == EMPTY64 ? CheckMode::Disable :
+               mCRC32[1] == EMPTY32 ? CheckMode::Partial : CheckMode::Full;
+    }
+
+    /// @brief return true if the checksums are identical
+    /// @param rhs other Checksum
+    __hostdev__ bool operator==(const Checksum &rhs) const {return mCRC64 == rhs.mCRC64;}
+
+    /// @brief return true if the checksums are not identical
+    /// @param rhs other Checksum
+    __hostdev__ bool operator!=(const Checksum &rhs) const {return mCRC64 != rhs.mCRC64;}
+};// Checksum
+
+/// @brief Maps 64 bit checksum to CheckMode enum
+/// @param checksum 64 bit checksum with two CRC32 codes
+/// @return CheckMode enum
+__hostdev__ inline CheckMode toCheckMode(const Checksum &checksum){return checksum.mode();}
+
 // ----------------------------> Grid <--------------------------------------
 
 /*
@@ -3493,7 +1811,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData
 { // sizeof(GridData) = 672B
     static const int MaxNameSize = 256; // due to NULL termination the maximum length is one less
     uint64_t mMagic; // 8B (0) magic to validate it is valid grid data.
-    uint64_t mChecksum; // 8B (8). Checksum of grid buffer.
+    Checksum mChecksum; // 8B (8). Checksum of grid buffer.
     Version mVersion; // 4B (16) major, minor, and patch version numbers
     BitFlags<32> mFlags; // 4B (20). flags for grid.
     uint32_t mGridIndex; // 4B (24). Index of this grid in the buffer
@@ -3501,21 +1819,18 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData
     uint64_t mGridSize; // 8B (32). byte count of this entire grid occupied in the buffer.
     char mGridName[MaxNameSize]; // 256B (40)
     Map mMap; // 264B (296). affine transformation between index and world space in both single and double precision
-    BBox<Vec3d> mWorldBBox; // 48B (560).
floating-point AABB of active values in WORLD SPACE (2 x 3 doubles) Vec3d mVoxelSize; // 24B (608). size of a voxel in world units GridClass mGridClass; // 4B (632). GridType mGridType; // 4B (636). int64_t mBlindMetadataOffset; // 8B (640). offset to beginning of GridBlindMetaData structures that follow this grid. uint32_t mBlindMetadataCount; // 4B (648). count of GridBlindMetaData structures that follow this grid. - uint32_t mData0; // 4B (652) - uint64_t mData1, mData2; // 2x8B (656) padding to 32 B alignment. mData1 is use for the total number of values indexed by an IndexGrid - /// @brief Use this method to initiate most member dat - __hostdev__ GridData& operator=(const GridData& other) - { - static_assert(8 * 84 == sizeof(GridData), "GridData has unexpected size"); - memcpy64(this, &other, 84); - return *this; - } + uint32_t mData0; // 4B (652) unused + uint64_t mData1; // 8B (656) is use for the total number of values indexed by an IndexGrid + uint64_t mData2; // 8B (664) padding to 32 B alignment + /// @brief Use this method to initiate most member data + GridData& operator=(const GridData&) = default; + //__hostdev__ GridData& operator=(const GridData& other){return *util::memcpy(this, &other);} __hostdev__ void init(std::initializer_list list = {GridFlags::IsBreadthFirst}, uint64_t gridSize = 0u, const Map& map = Map(), @@ -3525,9 +1840,9 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData #ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS mMagic = NANOVDB_MAGIC_GRID; #else - mMagic = NANOVDB_MAGIC_NUMBER; + mMagic = NANOVDB_MAGIC_NUMB; #endif - mChecksum = ~uint64_t(0);// all 64 bits ON means checksum is disabled + mChecksum.disable();// all 64 bits ON means checksum is disabled mVersion = Version(); mFlags.initMask(list); mGridIndex = 0u; @@ -3535,7 +1850,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mGridSize = gridSize; mGridName[0] = '\0'; mMap = map; - mWorldBBox = BBox();// invalid bbox + mWorldBBox = Vec3dBBox();// invalid bbox mVoxelSize = map.getVoxelSize(); mGridClass = gridClass; mGridType = gridType; @@ -3543,12 +1858,16 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mBlindMetadataCount = 0u; // i.e. 
no blind data mData0 = 0u; // zero padding mData1 = 0u; // only used for index and point grids - mData2 = NANOVDB_MAGIC_GRID; // since version 32.6.0 (might be removed in the future) + mData2 = NANOVDB_MAGIC_GRID; // since version 32.6.0 (will change in the future) } /// @brief return true if the magic number and the version are both valid __hostdev__ bool isValid() const { - if (mMagic == NANOVDB_MAGIC_GRID || mData2 == NANOVDB_MAGIC_GRID) return true; - bool test = mMagic == NANOVDB_MAGIC_NUMBER;// could be GridData or io::FileHeader + // Before v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and mData2 was undefined + // For v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and toMagic(mData2) = MagicType::NanoGrid + // After v32.7.X: toMagic(mMagic) = MagicType::NanoGrid and mData2 will again be undefined + const MagicType magic = toMagic(mMagic); + if (magic == MagicType::NanoGrid || toMagic(mData2) == MagicType::NanoGrid) return true; + bool test = magic == MagicType::NanoVDB;// could be GridData or io::FileHeader if (test) test = mVersion.isCompatible(); if (test) test = mGridCount > 0u && mGridIndex < mGridCount; if (test) test = mGridClass < GridClass::End && mGridType < GridType::End; @@ -3562,12 +1881,9 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData __hostdev__ void setStdDeviationOn(bool on = true) { mFlags.setMask(GridFlags::HasStdDeviation, on); } __hostdev__ bool setGridName(const char* src) { - char *dst = mGridName, *end = dst + MaxNameSize; - while (*src != '\0' && dst < end - 1) - *dst++ = *src++; - while (dst < end) - *dst++ = '\0'; - return *src == '\0'; // returns true if input grid name is NOT longer than MaxNameSize characters + const bool success = (util::strncpy(mGridName, src, MaxNameSize)[MaxNameSize-1] == '\0'); + if (!success) mGridName[MaxNameSize-1] = '\0'; + return success; // returns true if input grid name is NOT longer than MaxNameSize characters } // Affine transformations based on double precision template @@ -3592,31 +1908,43 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData template __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return mMap.applyIJTF(xyz); } - // @brief Return a non-const uint8_t pointer to the tree - __hostdev__ uint8_t* treePtr() { return reinterpret_cast(this + 1); }// TreeData is always right after GridData - //__hostdev__ TreeData* treePtr() { return reinterpret_cast(this + 1); }// TreeData is always right after GridData + // @brief Return a non-const void pointer to the tree + __hostdev__ void* treePtr() { return this + 1; }// TreeData is always right after GridData - // @brief Return a const uint8_t pointer to the tree - __hostdev__ const uint8_t* treePtr() const { return reinterpret_cast(this + 1); }// TreeData is always right after GridData - //__hostdev__ const TreeData* treePtr() const { return reinterpret_cast(this + 1); }// TreeData is always right after GridData + // @brief Return a const void pointer to the tree + __hostdev__ const void* treePtr() const { return this + 1; }// TreeData is always right after GridData - /// @brief Return a non-const uint8_t pointer to the first node at @c LEVEL - /// @tparam LEVEL of the node. LEVEL 0 means leaf node and LEVEL 3 means root node - /// @warning If not nodes exist at @c LEVEL NULL is returned + /// @brief Return a non-const void pointer to the first node at @c LEVEL + /// @tparam LEVEL Level of the node. 
LEVEL 0 means leaf node and LEVEL 3 means root node template - __hostdev__ const uint8_t* nodePtr() const + __hostdev__ const void* nodePtr() const { static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); - auto *treeData = this->treePtr(); - auto nodeOffset = *reinterpret_cast(treeData + 8*LEVEL);// skip LEVEL uint64_t - return nodeOffset ? PtrAdd(treeData, nodeOffset) : nullptr; + const void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; } - /// @brief Return a non-const uint8_t pointer to the first node at @c LEVEL + /// @brief Return a non-const void pointer to the first node at @c LEVEL /// @tparam LEVEL of the node. LEVEL 0 means leaf node and LEVEL 3 means root node /// @warning If not nodes exist at @c LEVEL NULL is returned template - __hostdev__ uint8_t* nodePtr(){return const_cast(const_cast(this)->template nodePtr());} + __hostdev__ void* nodePtr() + { + static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); + void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; + } + + /// @brief Return number of nodes at @c LEVEL + /// @tparam Level of the node. LEVEL 0 means leaf node and LEVEL 2 means upper node + template + __hostdev__ uint32_t nodeCount() const + { + static_assert(LEVEL >= 0 && LEVEL < 3, "invalid LEVEL template parameter"); + return *util::PtrAdd(this + 1, 4*(8 + LEVEL));// TreeData is always right after GridData + } /// @brief Returns a const reference to the blindMetaData at the specified linear offset. /// @@ -3624,7 +1952,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData __hostdev__ const GridBlindMetaData* blindMetaData(uint32_t n) const { NANOVDB_ASSERT(n < mBlindMetadataCount); - return PtrAdd(this, mBlindMetadataOffset) + n; + return util::PtrAdd(this, mBlindMetadataOffset) + n; } __hostdev__ const char* gridName() const @@ -3647,17 +1975,16 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData __hostdev__ static uint64_t memUsage() { return sizeof(GridData); } /// @brief return AABB of active values in world space - __hostdev__ const BBox& worldBBox() const { return mWorldBBox; } + __hostdev__ const Vec3dBBox& worldBBox() const { return mWorldBBox; } /// @brief return AABB of active values in index space __hostdev__ const CoordBBox& indexBBox() const {return *(const CoordBBox*)(this->nodePtr<3>());} /// @brief return the root table has size - __hostdev__ uint32_t rootTableSize() const { - if (const uint8_t *root = this->nodePtr<3>()) { - return *(const uint32_t*)(root + sizeof(CoordBBox)); - } - return 0u; + __hostdev__ uint32_t rootTableSize() const + { + const void *root = this->nodePtr<3>(); + return root ? 
*util::PtrAdd(root, sizeof(CoordBBox)) : 0u; } /// @brief test if the grid is empty, e.i the root table has size 0 @@ -3725,14 +2052,14 @@ class Grid : public GridData /// /// @note This method is only defined for IndexGrid = NanoGrid template - __hostdev__ typename enable_if::is_index, const uint64_t&>::type + __hostdev__ typename util::enable_if::is_index, const uint64_t&>::type valueCount() const { return DataType::mData1; } /// @brief @brief Return the total number of points indexed by this PointGrid /// /// @note This method is only defined for PointGrid = NanoGrid template - __hostdev__ typename enable_if::value, const uint64_t&>::type + __hostdev__ typename util::enable_if::value, const uint64_t&>::type pointCount() const { return DataType::mData1; } /// @brief Return a const reference to the tree @@ -3797,7 +2124,7 @@ class Grid : public GridData __hostdev__ Vec3T indexToWorldGradF(const Vec3T& grad) const { return DataType::applyIJTF(grad); } /// @brief Computes a AABB of active values in world space - //__hostdev__ const BBox& worldBBox() const { return DataType::mWorldBBox; } + //__hostdev__ const Vec3dBBox& worldBBox() const { return DataType::mWorldBBox; } /// @brief Computes a AABB of active values in index space /// @@ -3847,7 +2174,7 @@ class Grid : public GridData __hostdev__ const char* shortGridName() const { return DataType::mGridName; } /// @brief Return checksum of the grid buffer. - __hostdev__ uint64_t checksum() const { return DataType::mChecksum; } + __hostdev__ const Checksum& checksum() const { return DataType::mChecksum; } /// @brief Return true if this grid is empty, i.e. contains no values or nodes. //__hostdev__ bool isEmpty() const { return this->tree().isEmpty(); } @@ -3865,6 +2192,7 @@ class Grid : public GridData /// /// @warning Pointer might be NULL and the linear offset is assumed to be in the valid range // this method is deprecated !!!! + [[deprecated("Use Grid::getBlindData() instead.")]] __hostdev__ const void* blindData(uint32_t n) const { printf("\nnanovdb::Grid::blindData is unsafe and hence deprecated! Please use nanovdb::Grid::getBlindData instead.\n\n"); @@ -3925,28 +2253,32 @@ __hostdev__ int Grid::findBlindData(const char* name) const struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) TreeData { // sizeof(TreeData) == 64B - int64_t mNodeOffset[4];// 32B, byte offset from this tree to first leaf, lower, upper and root node. A zero offset means no node exists + int64_t mNodeOffset[4];// 32B, byte offset from this tree to first leaf, lower, upper and root node. If mNodeCount[N]=0 => mNodeOffset[N]==mNodeOffset[N+1] uint32_t mNodeCount[3]; // 12B, total number of nodes of type: leaf, lower internal, upper internal uint32_t mTileCount[3]; // 12B, total number of active tile values at the lower internal, upper internal and root node levels uint64_t mVoxelCount; // 8B, total number of active voxels in the root and all its child nodes. // No padding since it's always 32B aligned - __hostdev__ TreeData& operator=(const TreeData& other) - { - static_assert(8 * 8 == sizeof(TreeData), "TreeData has unexpected size"); - memcpy64(this, &other, 8); - return *this; + //__hostdev__ TreeData& operator=(const TreeData& other){return *util::memcpy(this, &other);} + TreeData& operator=(const TreeData&) = default; + __hostdev__ void setRoot(const void* root) { + NANOVDB_ASSERT(root); + mNodeOffset[3] = util::PtrDiff(root, this); } - __hostdev__ void setRoot(const void* root) {mNodeOffset[3] = root ? 
PtrDiff(root, this) : 0;}
-    __hostdev__ uint8_t* getRoot() { return mNodeOffset[3] ? PtrAdd<uint8_t>(this, mNodeOffset[3]) : nullptr; }
-    __hostdev__ const uint8_t* getRoot() const { return mNodeOffset[3] ? PtrAdd<uint8_t>(this, mNodeOffset[3]) : nullptr; }
+
+    /// @brief Get a non-const void pointer to the root node (never NULL)
+    __hostdev__ void* getRoot() { return util::PtrAdd(this, mNodeOffset[3]); }
+
+    /// @brief Get a const void pointer to the root node (never NULL)
+    __hostdev__ const void* getRoot() const { return util::PtrAdd(this, mNodeOffset[3]); }
 
     template<typename NodeT>
-    __hostdev__ void setFirstNode(const NodeT* node) {mNodeOffset[NodeT::LEVEL] = node ? PtrDiff(node, this) : 0;}
+    __hostdev__ void setFirstNode(const NodeT* node) {mNodeOffset[NodeT::LEVEL] = (node ? util::PtrDiff(node, this) : 0);}
 
-    __hostdev__ bool isEmpty() const {return mNodeOffset[3] ? *PtrAdd<uint32_t>(this, mNodeOffset[3] + sizeof(BBox<Coord>)) == 0 : true;}
+    /// @brief Return true if the root is empty, i.e. has no child nodes or constant tiles
+    __hostdev__ bool isEmpty() const {return mNodeOffset[3] ? *util::PtrAdd<uint32_t>(this, mNodeOffset[3] + sizeof(CoordBBox)) == 0 : true;}
 
     /// @brief Return the index bounding box of all the active values in this tree, i.e. in all nodes of the tree
-    __hostdev__ CoordBBox bbox() const {return mNodeOffset[3] ? *PtrAdd<CoordBBox>(this, mNodeOffset[3]) : CoordBBox();}
+    __hostdev__ CoordBBox bbox() const {return mNodeOffset[3] ? *util::PtrAdd<CoordBBox>(this, mNodeOffset[3]) : CoordBBox();}
 
     /// @brief return true if RootData is layout out immediately after TreeData in memory
     __hostdev__ bool isRootNext() const {return mNodeOffset[3] ? mNodeOffset[3] == sizeof(TreeData) : false; }
@@ -4009,19 +2341,9 @@ class Tree : public TreeData
     /// @brief return memory usage in bytes for the class
     __hostdev__ static uint64_t memUsage() { return sizeof(DataType); }
 
-    __hostdev__ RootT& root()
-    {
-        RootT* ptr = reinterpret_cast<RootT*>(DataType::getRoot());
-        NANOVDB_ASSERT(ptr);
-        return *ptr;
-    }
+    __hostdev__ RootT& root() {return *reinterpret_cast<RootT*>(DataType::getRoot());}
 
-    __hostdev__ const RootT& root() const
-    {
-        const RootT* ptr = reinterpret_cast<const RootT*>(DataType::getRoot());
-        NANOVDB_ASSERT(ptr);
-        return *ptr;
-    }
+    __hostdev__ const RootT& root() const {return *reinterpret_cast<const RootT*>(DataType::getRoot());}
 
     __hostdev__ AccessorType getAccessor() const { return AccessorType(this->root()); }
@@ -4085,8 +2407,8 @@ class Tree : public TreeData
     template<typename NodeT>
     __hostdev__ NodeT* getFirstNode()
     {
-        const int64_t offset = DataType::mNodeOffset[NodeT::LEVEL];
-        return offset ? PtrAdd<NodeT>(this, offset) : nullptr;
+        const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL];
+        return nodeOffset ? util::PtrAdd<NodeT>(this, nodeOffset) : nullptr;
     }
 
     /// @brief return a const pointer to the first node of the specified type
@@ -4095,16 +2417,15 @@ class Tree : public TreeData
     template<typename NodeT>
     __hostdev__ const NodeT* getFirstNode() const
     {
-        const int64_t offset = DataType::mNodeOffset[NodeT::LEVEL];
-        return offset ? PtrAdd<NodeT>(this, offset) : nullptr;
+        const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL];
+        return nodeOffset ? 
util::PtrAdd(this, nodeOffset) : nullptr; } /// @brief return a pointer to the first node at the specified level /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ typename NodeTrait::type* - getFirstNode() + __hostdev__ typename NodeTrait::type* getFirstNode() { return this->template getFirstNode::type>(); } @@ -4113,8 +2434,7 @@ class Tree : public TreeData /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ const typename NodeTrait::type* - getFirstNode() const + __hostdev__ const typename NodeTrait::type* getFirstNode() const { return this->template getFirstNode::type>(); } @@ -4189,8 +2509,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData __hostdev__ static KeyT CoordToKey(const CoordT& ijk) { return ijk & ~ChildT::MASK; } __hostdev__ static CoordT KeyToCoord(const KeyT& key) { return key; } #endif - BBox mBBox; // 24B. AABB of active values in index space. - uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node + math::BBox mBBox; // 24B. AABB of active values in index space. + uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node ValueT mBackground; // background value, i.e. value of any unset voxel ValueT mMinimum; // typically 4B, minimum of all the active values @@ -4213,7 +2533,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData { key = CoordToKey(k); state = false; - child = PtrDiff(ptr, data); + child = util::PtrDiff(ptr, data); } template __hostdev__ void setValue(const CoordType& k, bool s, const ValueType& v) @@ -4283,12 +2603,12 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData __hostdev__ ChildT* getChild(const Tile* tile) { NANOVDB_ASSERT(tile->child); - return PtrAdd(this, tile->child); + return util::PtrAdd(this, tile->child); } __hostdev__ const ChildT* getChild(const Tile* tile) const { NANOVDB_ASSERT(tile->child); - return PtrAdd(this, tile->child); + return util::PtrAdd(this, tile->child); } __hostdev__ const ValueT& getMin() const { return mMinimum; } @@ -4327,7 +2647,7 @@ class RootNode : public RootData using BuildType = typename DataType::BuildT; // in rare cases BuildType != ValueType, e.g. 
then BuildType = ValueMask and ValueType = bool using CoordType = typename ChildT::CoordType; - using BBoxType = BBox; + using BBoxType = math::BBox; using AccessorType = DefaultReadAccessor; using Tile = typename DataType::Tile; static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; @@ -4338,8 +2658,8 @@ class RootNode : public RootData class BaseIter { protected: - using DataT = typename match_const::type; - using TileT = typename match_const::type; + using DataT = typename util::match_const::type; + using TileT = typename util::match_const::type; DataT* mData; uint32_t mPos, mSize; __hostdev__ BaseIter(DataT* data = nullptr, uint32_t n = 0) @@ -4369,9 +2689,9 @@ class RootNode : public RootData template class ChildIter : public BaseIter { - static_assert(is_same::type, RootNode>::value, "Invalid RootT"); + static_assert(util::is_same::type, RootNode>::value, "Invalid RootT"); using BaseT = BaseIter; - using NodeT = typename match_const::type; + using NodeT = typename util::match_const::type; public: __hostdev__ ChildIter() @@ -4514,7 +2834,7 @@ class RootNode : public RootData class DenseIter : public BaseIter { using BaseT = BaseIter; - using NodeT = typename match_const::type; + using NodeT = typename util::match_const::type; public: __hostdev__ DenseIter() @@ -4599,7 +2919,7 @@ class RootNode : public RootData __hostdev__ const FloatType& average() const { return DataType::mAverage; } /// @brief Return the variance of all the active values encoded in this root node and any of its child nodes - __hostdev__ FloatType variance() const { return Pow2(DataType::mStdDevi); } + __hostdev__ FloatType variance() const { return math::Pow2(DataType::mStdDevi); } /// @brief Return a const reference to the standard deviation of all the active values encoded in this root node and any of its child nodes __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } @@ -4692,7 +3012,7 @@ class RootNode : public RootData template // __hostdev__ auto // occasionally fails with NVCC - __hostdev__ decltype(OpT::set(std::declval(), std::declval()...)) + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) set(const CoordType& ijk, ArgsT&&... args) { if (Tile* tile = DataType::probeTile(ijk)) { @@ -4800,7 +3120,7 @@ class RootNode : public RootData } template - //__hostdev__ decltype(OpT::get(std::declval(), std::declval()...)) + //__hostdev__ decltype(OpT::get(util::declval(), util::declval()...)) __hostdev__ auto getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) const { @@ -4817,7 +3137,7 @@ class RootNode : public RootData template // __hostdev__ auto // occasionally fails with NVCC - __hostdev__ decltype(OpT::set(std::declval(), std::declval()...)) + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) { if (Tile* tile = DataType::probeTile(ijk)) { @@ -4861,7 +3181,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData ~Tile() = delete; }; - BBox mBBox; // 24B. node bounding box. | + math::BBox mBBox; // 24B. node bounding box. | uint64_t mFlags; // 8B. node flags. 
| 32B aligned MaskT mValueMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned MaskT mChildMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned @@ -4886,7 +3206,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData __hostdev__ void setChild(uint32_t n, const void* ptr) { NANOVDB_ASSERT(mChildMask.isOn(n)); - mTable[n].child = PtrDiff(ptr, this); + mTable[n].child = util::PtrDiff(ptr, this); } template @@ -4900,12 +3220,12 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData __hostdev__ ChildT* getChild(uint32_t n) { NANOVDB_ASSERT(mChildMask.isOn(n)); - return PtrAdd(this, mTable[n].child); + return util::PtrAdd(this, mTable[n].child); } __hostdev__ const ChildT* getChild(uint32_t n) const { NANOVDB_ASSERT(mChildMask.isOn(n)); - return PtrAdd(this, mTable[n].child); + return util::PtrAdd(this, mTable[n].child); } __hostdev__ ValueT getValue(uint32_t n) const @@ -4949,7 +3269,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData ~InternalData() = delete; }; // InternalData -/// @brief Internal nodes of a VDB treedim(), +/// @brief Internal nodes of a VDB tree template class InternalNode : public InternalData { @@ -4979,9 +3299,9 @@ class InternalNode : public InternalData template class ChildIter : public MaskIterT { - static_assert(is_same::type, InternalNode>::value, "Invalid ParentT"); + static_assert(util::is_same::type, InternalNode>::value, "Invalid ParentT"); using BaseT = MaskIterT; - using NodeT = typename match_const::type; + using NodeT = typename util::match_const::type; ParentT* mParent; public: @@ -5181,7 +3501,7 @@ class InternalNode : public InternalData __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } /// @brief Return a const reference to the bounding box in index space of active values in this internal node and any of its child nodes - __hostdev__ const BBox& bbox() const { return DataType::mBBox; } + __hostdev__ const math::BBox& bbox() const { return DataType::mBBox; } /// @brief If the first entry in this node's table is a tile, return the tile's value. /// Otherwise, return the result of calling getFirstValue() on the child. @@ -5288,7 +3608,7 @@ class InternalNode : public InternalData template //__hostdev__ auto // occasionally fails with NVCC - __hostdev__ decltype(OpT::set(std::declval(), std::declval(), std::declval()...)) + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) set(const CoordType& ijk, ArgsT&&... args) { const uint32_t n = CoordToOffset(ijk); @@ -5384,7 +3704,7 @@ class InternalNode : public InternalData template __hostdev__ auto - //__hostdev__ decltype(OpT::get(std::declval(), std::declval(), std::declval()...)) + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) const { const uint32_t n = CoordToOffset(ijk); @@ -5397,7 +3717,7 @@ class InternalNode : public InternalData template //__hostdev__ auto // occasionally fails with NVCC - __hostdev__ decltype(OpT::set(std::declval(), std::declval(), std::declval()...)) + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... 
args) { const uint32_t n = CoordToOffset(ijk); @@ -5849,6 +4169,13 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafIndexBase __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } template __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + +protected: + /// @brief This class should be used as an abstract class and only constructed or deleted via child classes + LeafIndexBase() = default; + LeafIndexBase(const LeafIndexBase&) = default; + LeafIndexBase& operator=(const LeafIndexBase&) = default; + ~LeafIndexBase() = default; }; // LeafIndexBase // --------------------------> LeafData <------------------------------------ @@ -5870,12 +4197,6 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafDatahasStats() ? BaseT::mOffset + 514u : 0u; } __hostdev__ uint64_t getDev() const { return this->hasStats() ? BaseT::mOffset + 515u : 0u; } __hostdev__ uint64_t getValue(uint32_t i) const { return BaseT::mOffset + i; } // dense leaf node with active and inactive voxels - - /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; }; // LeafData // --------------------------> LeafData <------------------------------------ @@ -5888,7 +4209,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData> 54u & 511u); // last 9 bits of mPrefixSum do not account for the last word in mValueMask + return util::countOn(BaseT::mValueMask.words()[7]) + (BaseT::mPrefixSum >> 54u & 511u); // last 9 bits of mPrefixSum do not account for the last word in mValueMask } __hostdev__ uint64_t lastOffset() const { return BaseT::mOffset + this->valueCount() - 1u; } __hostdev__ uint64_t getMin() const { return this->hasStats() ? 
this->lastOffset() + 1u : 0u; } @@ -5901,16 +4222,10 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData> 6; const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u); if (!(w & mask)) return uint64_t(0); // if i'th value is inactive return offset to background value - uint64_t sum = BaseT::mOffset + CountOn(w & (mask - 1u)); + uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u)); if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u; return sum; } - - /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; }; // LeafData // --------------------------> LeafData <------------------------------------ @@ -6147,7 +4462,7 @@ class LeafNode : public LeafData __hostdev__ ValueIterator beginValue() const { return ValueIterator(this); } __hostdev__ ValueIterator cbeginValueAll() const { return ValueIterator(this); } - static_assert(is_same::Type>::value, "Mismatching BuildType"); + static_assert(util::is_same::Type>::value, "Mismatching BuildType"); static constexpr uint32_t LOG2DIM = Log2Dim; static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node @@ -6206,15 +4521,15 @@ class LeafNode : public LeafData __hostdev__ static uint32_t dim() { return 1u << LOG2DIM; } /// @brief Return the bounding box in index space of active values in this leaf node - __hostdev__ BBox bbox() const + __hostdev__ math::BBox bbox() const { - BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); + math::BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); if (this->hasBBox()) { bbox.max()[0] += DataType::mBBoxDif[0]; bbox.max()[1] += DataType::mBBoxDif[1]; bbox.max()[2] += DataType::mBBoxDif[2]; } else { // very rare case - bbox = BBox(); // invalid + bbox = math::BBox(); // invalid } return bbox; } @@ -6364,7 +4679,7 @@ class LeafNode : public LeafData template __hostdev__ auto - //__hostdev__ decltype(OpT::get(std::declval(), std::declval(), std::declval()...)) + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) getAndCache(const CoordType& ijk, const AccT&, ArgsT&&... args) const { return OpT::get(*this, CoordToOffset(ijk), args...); @@ -6372,7 +4687,7 @@ class LeafNode : public LeafData template //__hostdev__ auto // occasionally fails with NVCC - __hostdev__ decltype(OpT::set(std::declval(), std::declval(), std::declval()...)) + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) setAndCache(const CoordType& ijk, const AccT&, ArgsT&&... 
args) { return OpT::set(*this, CoordToOffset(ijk), args...); @@ -6407,12 +4722,12 @@ __hostdev__ inline bool LeafNode::updateBBox() } NANOVDB_ASSERT(word64); update(Xmin, Xmax, 0); - update(FindLowestOn(word64) >> 3, FindHighestOn(word64) >> 3, 1); + update(util::findLowestOn(word64) >> 3, util::findHighestOn(word64) >> 3, 1); const uint32_t *p = reinterpret_cast(&word64), word32 = p[0] | p[1]; const uint16_t *q = reinterpret_cast(&word32), word16 = q[0] | q[1]; - const uint8_t * b = reinterpret_cast(&word16), byte = b[0] | b[1]; + const uint8_t *b = reinterpret_cast(&word16), byte = b[0] | b[1]; NANOVDB_ASSERT(byte); - update(FindLowestOn(static_cast(byte)), FindHighestOn(static_cast(byte)), 2); + update(util::findLowestOn(static_cast(byte)), util::findHighestOn(static_cast(byte)), 2); DataType::mFlags |= uint8_t(2); // set 2nd bit on, which indicates that this nodes has a bbox return true; } // LeafNode::updateBBox @@ -6507,6 +4822,89 @@ using OnIndexGrid = Grid; using IndexMaskGrid = Grid; using OnIndexMaskGrid = Grid; +// --------------------------> callNanoGrid <------------------------------------ + +/** +* @brief Below is an example of the struct used for generic programming with callNanoGrid +* @details For an example see "struct Crc32TailOld" in nanovdb/tools/GridChecksum.h or +* "struct IsNanoGridValid" in nanovdb/tools/GridValidator.h +* @code +* struct OpT { + // define these two static functions with non-const GridData +* template +* static auto known( GridData *gridData, args...); +* static auto unknown( GridData *gridData, args...); +* // or alternatively these two static functions with const GridData +* template +* static auto known(const GridData *gridData, args...); +* static auto unknown(const GridData *gridData, args...); +* }; +* @endcode +* +* @brief Here is an example of how to use callNanoGrid in client code +* @code +* return callNanoGrid(gridData, args...); +* @endcode +*/ + +/// @brief Use this function, which depends a pointer to GridData, to call +/// other functions that depend on a NanoGrid of a known ValueType. +/// @details This function allows for generic programming by converting GridData +/// to a NanoGrid of the type encoded in GridData::mGridType. +template +auto callNanoGrid(GridDataT *gridData, ArgsT&&... 
args) +{ + static_assert(util::is_same::value, "Expected gridData to be of type GridData* or const GridData*"); + switch (gridData->mGridType){ + case GridType::Float: + return OpT::template known(gridData, args...); + case GridType::Double: + return OpT::template known(gridData, args...); + case GridType::Int16: + return OpT::template known(gridData, args...); + case GridType::Int32: + return OpT::template known(gridData, args...); + case GridType::Int64: + return OpT::template known(gridData, args...); + case GridType::Vec3f: + return OpT::template known(gridData, args...); + case GridType::Vec3d: + return OpT::template known(gridData, args...); + case GridType::UInt32: + return OpT::template known(gridData, args...); + case GridType::Mask: + return OpT::template known(gridData, args...); + case GridType::Index: + return OpT::template known(gridData, args...); + case GridType::OnIndex: + return OpT::template known(gridData, args...); + case GridType::IndexMask: + return OpT::template known(gridData, args...); + case GridType::OnIndexMask: + return OpT::template known(gridData, args...); + case GridType::Boolean: + return OpT::template known(gridData, args...); + case GridType::RGBA8: + return OpT::template known(gridData, args...); + case GridType::Fp4: + return OpT::template known(gridData, args...); + case GridType::Fp8: + return OpT::template known(gridData, args...); + case GridType::Fp16: + return OpT::template known(gridData, args...); + case GridType::FpN: + return OpT::template known(gridData, args...); + case GridType::Vec4f: + return OpT::template known(gridData, args...); + case GridType::Vec4d: + return OpT::template known(gridData, args...); + case GridType::UInt8: + return OpT::template known(gridData, args...); + default: + return OpT::unknown(gridData, args...); + } +}// callNanoGrid + // --------------------------> ReadAccessor <------------------------------------ /// @brief A read-only value accessor with three levels of node caching. This allows for @@ -7218,7 +5616,7 @@ class ReadAccessor __hostdev__ const NodeT* getNode() const { using T = typename NodeTrait::type; - static_assert(is_same::value, "ReadAccessor::getNode: Invalid node type"); + static_assert(util::is_same::value, "ReadAccessor::getNode: Invalid node type"); return reinterpret_cast(mNode[NodeT::LEVEL]); } @@ -7511,9 +5909,9 @@ class GridMetaData } GridMetaData(const GridData* gridData) { - static_assert(8 * 96 == sizeof(GridMetaData), "GridMetaData has unexpected size"); if (GridMetaData::safeCast(gridData)) { - memcpy64(this, gridData, 96); + *this = *reinterpret_cast(gridData); + //util::memcpy(this, (const GridMetaData*)gridData); } else {// otherwise copy each member individually mGridData = *gridData; mTreeData = *reinterpret_cast(gridData->treePtr()); @@ -7521,6 +5919,7 @@ class GridMetaData mRootTableSize = gridData->rootTableSize(); } } + GridMetaData& operator=(const GridMetaData&) = default; /// @brief return true if the RootData follows right after the TreeData. 
/// If so, this implies that it's safe to cast the grid from which /// this instance was constructed to a GridMetaData @@ -7558,14 +5957,14 @@ class GridMetaData __hostdev__ uint32_t gridCount() const { return mGridData.mGridCount; } __hostdev__ const char* shortGridName() const { return mGridData.mGridName; } __hostdev__ const Map& map() const { return mGridData.mMap; } - __hostdev__ const BBox& worldBBox() const { return mGridData.mWorldBBox; } - __hostdev__ const BBox& indexBBox() const { return mIndexBBox; } + __hostdev__ const Vec3dBBox& worldBBox() const { return mGridData.mWorldBBox; } + __hostdev__ const CoordBBox& indexBBox() const { return mIndexBBox; } __hostdev__ Vec3d voxelSize() const { return mGridData.mVoxelSize; } __hostdev__ int blindDataCount() const { return mGridData.mBlindMetadataCount; } __hostdev__ uint64_t activeVoxelCount() const { return mTreeData.mVoxelCount; } __hostdev__ const uint32_t& activeTileCount(uint32_t level) const { return mTreeData.mTileCount[level - 1]; } __hostdev__ uint32_t nodeCount(uint32_t level) const { return mTreeData.mNodeCount[level]; } - __hostdev__ uint64_t checksum() const { return mGridData.mChecksum; } + __hostdev__ const Checksum& checksum() const { return mGridData.mChecksum; } __hostdev__ uint32_t rootTableSize() const { return mRootTableSize; } __hostdev__ bool isEmpty() const { return mRootTableSize == 0; } __hostdev__ Version version() const { return mGridData.mVersion; } @@ -7587,9 +5986,9 @@ class PointAccessor : public DefaultReadAccessor , mGrid(grid) , mData(grid.template getBlindData(0)) { - NANOVDB_ASSERT(grid.gridType() == mapToGridType()); - NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value)); + NANOVDB_ASSERT(grid.gridType() == toGridType()); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value)); } /// @brief return true if this access was initialized correctly @@ -7652,11 +6051,11 @@ class PointAccessor : public DefaultReadAccessor { NANOVDB_ASSERT(mData); NANOVDB_ASSERT(grid.gridType() == GridType::PointIndex); - NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value)); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value)); } /// @brief return true if this access was initialized correctly @@ -7770,16 +6169,16 @@ class ChannelAccessor : public DefaultReadAccessor } /// @brief Return the linear offset into a channel that maps to the specified coordinate - __hostdev__ uint64_t getIndex(const Coord& ijk) const { return BaseT::getValue(ijk); } - __hostdev__ uint64_t idx(int i, int j, int k) const { return BaseT::getValue(Coord(i, j, k)); } + __hostdev__ uint64_t getIndex(const math::Coord& ijk) const { return BaseT::getValue(ijk); } + __hostdev__ uint64_t idx(int i, int j, int k) const { return BaseT::getValue(math::Coord(i, j, k)); } 
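The math::Coord-based ChannelAccessor methods above pair an index-grid lookup with a blind-data channel. A hedged usage sketch, assuming the accessor's (grid, channel) constructor with channel 0; sampleChannel0 is a hypothetical helper:

```cpp
#include <nanovdb/NanoVDB.h>

// Reads a float value from blind-data channel 0 of an OnIndex grid.
float sampleChannel0(const nanovdb::NanoGrid<nanovdb::ValueOnIndex>& grid,
                     const nanovdb::math::Coord& ijk)
{
    // the accessor caches the ijk -> linear-index traversal; that index
    // then addresses the grid's first blind-data channel of type float
    nanovdb::ChannelAccessor<float, nanovdb::ValueOnIndex> acc(grid);
    return acc.getValue(ijk);
}
```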
/// @brief Return the value from a cached channel that maps to the specified coordinate - __hostdev__ ChannelT& getValue(const Coord& ijk) const { return mChannel[BaseT::getValue(ijk)]; } - __hostdev__ ChannelT& operator()(const Coord& ijk) const { return this->getValue(ijk); } - __hostdev__ ChannelT& operator()(int i, int j, int k) const { return this->getValue(Coord(i, j, k)); } + __hostdev__ ChannelT& getValue(const math::Coord& ijk) const { return mChannel[BaseT::getValue(ijk)]; } + __hostdev__ ChannelT& operator()(const math::Coord& ijk) const { return this->getValue(ijk); } + __hostdev__ ChannelT& operator()(int i, int j, int k) const { return this->getValue(math::Coord(i, j, k)); } /// @brief return the state and updates the value of the specified voxel - __hostdev__ bool probeValue(const Coord& ijk, typename remove_const::type& v) const + __hostdev__ bool probeValue(const math::Coord& ijk, typename util::remove_const::type& v) const { uint64_t idx; const bool isActive = BaseT::probeValue(ijk, idx); @@ -7790,14 +6189,14 @@ class ChannelAccessor : public DefaultReadAccessor /// /// @note The template parameter can be either const or non-const template - __hostdev__ T& getValue(const Coord& ijk, T* channelPtr) const { return channelPtr[BaseT::getValue(ijk)]; } + __hostdev__ T& getValue(const math::Coord& ijk, T* channelPtr) const { return channelPtr[BaseT::getValue(ijk)]; } }; // ChannelAccessor #if 0 // This MiniGridHandle class is only included as a stand-alone example. Note that aligned_alloc is a C++17 feature! // Normally we recommend using GridHandle defined in util/GridHandle.h but this minimal implementation could be an -// alternative when using the IO medthods defined below. +// alternative when using the IO methods defined below. struct MiniGridHandle { struct BufferType { uint8_t *data; @@ -7825,7 +6224,26 @@ namespace io { enum class Codec : uint16_t { NONE = 0, ZIP = 1, BLOSC = 2, - END = 3 }; + End = 3, + StrLen = 6 + End }; + +__hostdev__ inline const char* toStr(char *dst, Codec codec) +{ + switch (codec){ + case Codec::NONE: return util::strcpy(dst, "NONE"); + case Codec::ZIP: return util::strcpy(dst, "ZIP"); + case Codec::BLOSC : return util::strcpy(dst, "BLOSC"); + default: return util::strcpy(dst, "END"); + } +} + +__hostdev__ inline Codec toCodec(const char *str) +{ + if (util::streq(str, "none")) return Codec::NONE; + if (util::streq(str, "zip")) return Codec::ZIP; + if (util::streq(str, "blosc")) return Codec::BLOSC; + return Codec::End; +} /// @brief Data encoded at the head of each segment of a file or stream. /// @@ -7836,7 +6254,7 @@ struct FileHeader {// 16 bytes Version version;// 4 bytes version numbers uint16_t gridCount;// 2 bytes Codec codec;// 2 bytes - bool isValid() const {return magic == NANOVDB_MAGIC_NUMBER || magic == NANOVDB_MAGIC_FILE;} + bool isValid() const {return magic == NANOVDB_MAGIC_NUMB || magic == NANOVDB_MAGIC_FILE;} }; // FileHeader ( 16 bytes = 2 words ) // @brief Data encoded for each of the grids associated with a segment. @@ -7861,7 +6279,7 @@ struct FileMetaData uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B. GridType gridType; // 4B. GridClass gridClass; // 4B. - BBox worldBBox; // 2 * 3 * 8 = 48B. + Vec3dBBox worldBBox; // 2 * 3 * 8 = 48B. CoordBBox indexBBox; // 2 * 3 * 4 = 24B. Vec3d voxelSize; // 24B. uint32_t nameSize; // 4B. 
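The io::Codec enum above now encodes its own string-buffer length (StrLen = 6 + End), and the new toStr writes into a caller-supplied buffer instead of returning a pointer to a static string. A minimal sketch of the new calling convention:

```cpp
#include <nanovdb/NanoVDB.h>
#include <cstdio>

void printCodec(nanovdb::io::Codec codec)
{
    char str[int(nanovdb::io::Codec::StrLen)];// 9 chars, enough for "BLOSC" plus slack
    printf("codec = %s\n", nanovdb::io::toStr(str, codec));
}
```

Note that toCodec matches the lower-case spellings ("none", "zip", "blosc") while toStr emits upper-case, so round-tripping a codec name requires lower-casing it first.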
@@ -7875,13 +6293,6 @@ struct FileMetaData // the following code block uses std and therefore needs to be ignored by CUDA and HIP #if !defined(__CUDA_ARCH__) && !defined(__HIP__) -inline const char* toStr(Codec codec) -{ - static const char * LUT[] = { "NONE", "ZIP", "BLOSC" , "END" }; - static_assert(sizeof(LUT) / sizeof(char*) - 1 == int(Codec::END), "Unexpected size of LUT"); - return LUT[static_cast(codec)]; -} - // Note that starting with version 32.6.0 it is possible to write and read raw grid buffers to // files, e.g. os.write((const char*)&buffer.data(), buffer.size()) or more conveniently as // handle.write(fileName). In addition to this simple approach we offer the methods below to @@ -7906,18 +6317,17 @@ inline const char* toStr(Codec codec) template // StreamT class must support: "void write(const char*, size_t)" void writeUncompressedGrid(StreamT& os, const GridData* gridData, bool raw = false) { - NANOVDB_ASSERT(gridData->mMagic == NANOVDB_MAGIC_NUMBER || gridData->mMagic == NANOVDB_MAGIC_GRID); + NANOVDB_ASSERT(gridData->mMagic == NANOVDB_MAGIC_NUMB || gridData->mMagic == NANOVDB_MAGIC_GRID); NANOVDB_ASSERT(gridData->mVersion.isCompatible()); if (!raw) {// segment with a single grid: FileHeader, FileMetaData, gridName, Grid #ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS FileHeader head{NANOVDB_MAGIC_FILE, gridData->mVersion, 1u, Codec::NONE}; #else - FileHeader head{NANOVDB_MAGIC_NUMBER, gridData->mVersion, 1u, Codec::NONE}; + FileHeader head{NANOVDB_MAGIC_NUMB, gridData->mVersion, 1u, Codec::NONE}; #endif const char* gridName = gridData->gridName(); - uint32_t nameSize = 1; // '\0' - for (const char* p = gridName; *p != '\0'; ++p) ++nameSize; - const TreeData* treeData = (const TreeData*)gridData->treePtr(); + const uint32_t nameSize = util::strlen(gridName) + 1;// include '\0' + const TreeData* treeData = (const TreeData*)(gridData->treePtr()); FileMetaData meta{gridData->mGridSize, gridData->mGridSize, 0u, treeData->mVoxelCount, gridData->mGridType, gridData->mGridClass, gridData->mWorldBBox, treeData->bbox(), gridData->mVoxelSize, nameSize, @@ -7986,10 +6396,12 @@ VecT readUncompressedGrids(StreamT& is, const typename GridHandleT: fprintf(stderr, "nanovdb::readUncompressedGrids: invalid magic number = \"%s\"\n", (const char*)&(head.magic)); exit(EXIT_FAILURE); } else if (!head.version.isCompatible()) { - fprintf(stderr, "nanovdb::readUncompressedGrids: invalid major version = \"%s\"\n", head.version.c_str()); + char str[20]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid major version = \"%s\"\n", toStr(str, head.version)); exit(EXIT_FAILURE); } else if (head.codec != Codec::NONE) { - fprintf(stderr, "nanovdb::readUncompressedGrids: invalid codec = \"%s\"\n", toStr(head.codec)); + char str[8]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid codec = \"%s\"\n", toStr(str, head.codec)); exit(EXIT_FAILURE); } FileMetaData meta; @@ -8041,7 +6453,7 @@ VecT readUncompressedGrids(const char* fileName, const typename Gri // ----------------------------> Implementations of random access methods <-------------------------------------- -/// @brief Implements Tree::getValue(Coord), i.e. return the value associated with a specific coordinate @c ijk. +/// @brief Implements Tree::getValue(math::Coord), i.e. return the value associated with a specific coordinate @c ijk. /// @tparam BuildT Build type of the grid being called /// @details The value at a coordinate maps to the background, a tile value or a leaf value. 
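For the writeUncompressedGrid path above, any stream type with a "void write(const char*, size_t)" method works, and std::ofstream qualifies. A minimal sketch, assuming the grid already resides in memory; saveGrid is a hypothetical helper:

```cpp
#include <nanovdb/NanoVDB.h>
#include <fstream>

// Writes one grid as a single segment: FileHeader, FileMetaData, name, grid.
void saveGrid(const nanovdb::GridData* gridData, const char* fileName)
{
    std::ofstream os(fileName, std::ios::out | std::ios::binary);
    nanovdb::io::writeUncompressedGrid(os, gridData);// raw = false
}
```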
template @@ -8078,7 +6490,7 @@ struct SetVoxel __hostdev__ static auto set(NanoLeaf& leaf, uint32_t n, const ValueT& v) { leaf.mValues[n] = v; } }; // SetVoxel -/// @brief Implements Tree::isActive(Coord) +/// @brief Implements Tree::isActive(math::Coord) /// @tparam BuildT Build type of the grid being called template struct GetState @@ -8090,7 +6502,7 @@ struct GetState __hostdev__ static auto get(const NanoLeaf& leaf, uint32_t n) { return leaf.mValueMask.isOn(n); } }; // GetState -/// @brief Implements Tree::getDim(Coord) +/// @brief Implements Tree::getDim(math::Coord) /// @tparam BuildT Build type of the grid being called template struct GetDim @@ -8102,7 +6514,7 @@ struct GetDim __hostdev__ static uint32_t get(const NanoLeaf&, uint32_t) { return 1u; } }; // GetDim -/// @brief Return the pointer to the leaf node that contains Coord. Implements Tree::probeLeaf(Coord) +/// @brief Return the pointer to the leaf node that contains math::Coord. Implements Tree::probeLeaf(math::Coord) /// @tparam BuildT Build type of the grid being called template struct GetLeaf @@ -8114,7 +6526,7 @@ struct GetLeaf __hostdev__ static const NanoLeaf* get(const NanoLeaf& leaf, uint32_t) { return &leaf; } }; // GetLeaf -/// @brief Return point to the lower internal node where Coord maps to one of its values, i.e. terminates +/// @brief Return point to the lower internal node where math::Coord maps to one of its values, i.e. terminates /// @tparam BuildT Build type of the grid being called template struct GetLower @@ -8126,7 +6538,7 @@ struct GetLower __hostdev__ static const NanoLower* get(const NanoLeaf&, uint32_t) { return nullptr; } }; // GetLower -/// @brief Return point to the upper internal node where Coord maps to one of its values, i.e. terminates +/// @brief Return point to the upper internal node where math::Coord maps to one of its values, i.e. terminates /// @tparam BuildT Build type of the grid being called template struct GetUpper @@ -8138,7 +6550,7 @@ struct GetUpper __hostdev__ static const NanoUpper* get(const NanoLeaf&, uint32_t) { return nullptr; } }; // GetUpper -/// @brief Implements Tree::probeLeaf(Coord) +/// @brief Implements Tree::probeLeaf(math::Coord) /// @tparam BuildT Build type of the grid being called template struct ProbeValue @@ -8171,7 +6583,7 @@ struct ProbeValue } }; // ProbeValue -/// @brief Implements Tree::getNodeInfo(Coord) +/// @brief Implements Tree::getNodeInfo(math::Coord) /// @tparam BuildT Build type of the grid being called template struct GetNodeInfo @@ -8207,6 +6619,6 @@ struct GetNodeInfo } }; // GetNodeInfo -} // namespace nanovdb +} // namespace nanovdb =================================================================== #endif // end of NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/NodeManager.h b/nanovdb/nanovdb/NodeManager.h new file mode 100644 index 0000000000..0e95ecf872 --- /dev/null +++ b/nanovdb/nanovdb/NodeManager.h @@ -0,0 +1,327 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/NodeManager.h + + \author Ken Museth + + \date February 12, 2021 + + \brief This class allows for sequential access to nodes + in a NanoVDB tree on both the host and device. + + \details The ordering of the sequential access to nodes is always breadth-first! 
+*/ + +#include // for NanoGrid etc +#include // for HostBuffer + +#ifndef NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED +#define NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED + +namespace nanovdb { + +/// @brief NodeManager allows for sequential access to nodes +template +class NodeManager; + +/// @brief NodeManagerHandle manages the memory of a NodeManager +template +class NodeManagerHandle; + +/// @brief brief Construct a NodeManager and return its handle +/// +/// @param grid grid whose nodes will be accessed sequentially +/// @param buffer buffer from which to allocate the output handle +/// +/// @note This is the only way to create a NodeManager since it's using +/// managed memory pointed to by a NodeManagerHandle. +template +NodeManagerHandle createNodeManager(const NanoGrid &grid, + const BufferT& buffer = BufferT()); + +struct NodeManagerData +{// 48B = 6*8B + uint64_t mMagic;// 8B + union {int64_t mPadding; uint8_t mLinear;};// 8B of which 1B is used for a binary flag + void *mGrid;// 8B pointer to either host or device grid + union {int64_t *mPtr[3], mOff[3];};// 24B, use mOff if mLinear!=0 +}; + +/// @brief This class serves to manage a raw memory buffer of a NanoVDB NodeManager or LeafManager. +template +class NodeManagerHandle +{ + GridType mGridType{GridType::Unknown}; + BufferT mBuffer; + + template + const NodeManager* getMgr() const { + return mGridType == toGridType() ? (const NodeManager*)mBuffer.data() : nullptr; + } + + template + typename util::enable_if::hasDeviceDual, const NodeManager*>::type + getDeviceMgr() const { + return mGridType == toGridType() ? (const NodeManager*)mBuffer.deviceData() : nullptr; + } + + template + static T* no_const(const T* ptr) { return const_cast(ptr); } + +public: + /// @brief Move constructor from a buffer + NodeManagerHandle(GridType gridType, BufferT&& buffer) : mGridType(gridType) { mBuffer = std::move(buffer); } + /// @brief Empty ctor + NodeManagerHandle() = default; + /// @brief Disallow copy-construction + NodeManagerHandle(const NodeManagerHandle&) = delete; + /// @brief Disallow copy assignment operation + NodeManagerHandle& operator=(const NodeManagerHandle&) = delete; + /// @brief Move copy assignment operation + NodeManagerHandle& operator=(NodeManagerHandle&& other) noexcept { + mGridType = other.mGridType; + mBuffer = std::move(other.mBuffer); + other.mGridType = GridType::Unknown; + return *this; + } + /// @brief Move copy-constructor + NodeManagerHandle(NodeManagerHandle&& other) noexcept { + mGridType = other.mGridType; + mBuffer = std::move(other.mBuffer); + other.mGridType = GridType::Unknown; + } + /// @brief Default destructor + ~NodeManagerHandle() { this->reset(); } + /// @brief clear the buffer + void reset() { mBuffer.clear(); } + + /// @brief Return a reference to the buffer + BufferT& buffer() { return mBuffer; } + + /// @brief Return a const reference to the buffer + const BufferT& buffer() const { return mBuffer; } + + /// @brief Returns a non-const pointer to the data. + /// + /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized + void* data() { return mBuffer.data(); } + + /// @brief Returns a const pointer to the data. + /// + /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized + const void* data() const { return mBuffer.data(); } + + /// @brief Returns the size in bytes of the raw memory buffer managed by this NodeManagerHandle's allocator. 
+    uint64_t size() const { return mBuffer.size(); }
+
+    /// @brief Returns a const pointer to the NodeManager encoded in this NodeManagerHandle.
+    ///
+    /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid!
+    template<typename BuildT>
+    const NodeManager<BuildT>* mgr() const { return this->template getMgr<BuildT>(); }
+
+    /// @brief Returns a pointer to the NodeManager encoded in this NodeManagerHandle.
+    ///
+    /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid!
+    template<typename BuildT>
+    NodeManager<BuildT>* mgr() { return no_const(this->template getMgr<BuildT>()); }
+
+    /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU
+    ///
+    /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid!
+    template<typename BuildT, typename U = BufferT>
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, const NodeManager<BuildT>*>::type
+    deviceMgr() const { return this->template getDeviceMgr<BuildT>(); }
+
+    /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU
+    ///
+    /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid!
+    template<typename BuildT, typename U = BufferT>
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, NodeManager<BuildT>*>::type
+    deviceMgr() { return no_const(this->template getDeviceMgr<BuildT>()); }
+
+    /// @brief Upload the NodeManager to the device, e.g. from CPU to GPU
+    ///
+    /// @note This method is only available if the buffer supports devices
+    template<typename U = BufferT>
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type
+    deviceUpload(void* deviceGrid, void* stream = nullptr, bool sync = true)
+    {
+        assert(deviceGrid);
+        auto *data = reinterpret_cast<NodeManagerData*>(mBuffer.data());
+        void *tmp = data->mGrid;
+        data->mGrid = deviceGrid;
+        mBuffer.deviceUpload(stream, sync);
+        data->mGrid = tmp;
+    }
+
+    /// @brief Download the NodeManager from the device, e.g. from GPU to CPU
+    ///
+    /// @note This method is only available if the buffer supports devices
+    template<typename U = BufferT>
+    typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type
+    deviceDownload(void* stream = nullptr, bool sync = true)
+    {
+        auto *data = reinterpret_cast<NodeManagerData*>(mBuffer.data());
+        void *tmp = data->mGrid;
+        mBuffer.deviceDownload(stream, sync);
+        data->mGrid = tmp;
+    }
+};// NodeManagerHandle
+
+/// @brief This class allows for sequential access to nodes in a NanoVDB tree
+///
+/// @details Nodes are always arranged breadth first during sequential access of nodes
+///          at a particular level.
+template<typename BuildT>
+class NodeManager : private NodeManagerData
+{
+    using DataT = NodeManagerData;
+    using GridT = NanoGrid<BuildT>;
+    using TreeT = typename GridTree<GridT>::type;
+    template<int LEVEL>
+    using NodeT = typename NodeTrait<TreeT, LEVEL>::type;
+    using RootT = NodeT<3>;// root node
+    using Node2 = NodeT<2>;// upper internal node
+    using Node1 = NodeT<1>;// lower internal node
+    using Node0 = NodeT<0>;// leaf node
+
+public:
+    static constexpr bool FIXED_SIZE = Node0::FIXED_SIZE && Node1::FIXED_SIZE && Node2::FIXED_SIZE;
+
+    NodeManager(const NodeManager&) = delete;
+    NodeManager(NodeManager&&) = delete;
+    NodeManager& operator=(const NodeManager&) = delete;
+    NodeManager& operator=(NodeManager&&) = delete;
+    ~NodeManager() = delete;
+
+    /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory.
+    ///        This allows for direct and memory-efficient linear access to nodes.
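The linear, breadth-first layout just described makes host-side traversal a plain loop over nodes. A hedged sketch, assuming a float grid and the createNodeManager factory declared earlier; countActiveVoxels is illustrative only:

```cpp
#include <nanovdb/NanoVDB.h>
#include <nanovdb/NodeManager.h>

// Sums the active-voxel counts of all leaf nodes via sequential access.
// Note that the NodeManager references the grid, so the grid must outlive
// the returned handle.
uint64_t countActiveVoxels(const nanovdb::NanoGrid<float>& grid)
{
    auto handle = nanovdb::createNodeManager(grid);// HostBuffer by default
    const auto* mgr = handle.mgr<float>();// NULL if BuildT does not match the grid
    if (!mgr) return 0u;
    uint64_t sum = 0;
    for (uint32_t i = 0; i < mgr->leafCount(); ++i)
        sum += mgr->leaf(i).valueMask().countOn();// active voxels in this leaf
    return sum;
}
```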
+ __hostdev__ static bool isLinear(const GridT &grid) {return FIXED_SIZE && grid.isBreadthFirst();} + + /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory. + /// This allows for direct and memory-efficient linear access to nodes. + __hostdev__ bool isLinear() const {return DataT::mLinear!=0u;} + + /// @brief Return the memory footprint in bytes of the NodeManager derived from the specified grid + __hostdev__ static uint64_t memUsage(const GridT &grid) { + uint64_t size = sizeof(NodeManagerData); + if (!NodeManager::isLinear(grid)) { + const uint32_t *p = grid.tree().mNodeCount; + size += sizeof(int64_t)*(p[0]+p[1]+p[2]); + } + return size; + } + + /// @brief Return the memory footprint in bytes of this instance + __hostdev__ uint64_t memUsage() const {return NodeManager::memUsage(this->grid());} + + /// @brief Return a reference to the grid + __hostdev__ GridT& grid() { return *reinterpret_cast(DataT::mGrid); } + __hostdev__ const GridT& grid() const { return *reinterpret_cast(DataT::mGrid); } + + /// @brief Return a reference to the tree + __hostdev__ TreeT& tree() { return this->grid().tree(); } + __hostdev__ const TreeT& tree() const { return this->grid().tree(); } + + /// @brief Return a reference to the root + __hostdev__ RootT& root() { return this->tree().root(); } + __hostdev__ const RootT& root() const { return this->tree().root(); } + + /// @brief Return the number of tree nodes at the specified level + /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level + __hostdev__ uint64_t nodeCount(int level) const { return this->tree().nodeCount(level); } + + __hostdev__ uint64_t leafCount() const { return this->tree().nodeCount(0); } + __hostdev__ uint64_t lowerCount() const { return this->tree().nodeCount(1); } + __hostdev__ uint64_t upperCount() const { return this->tree().nodeCount(2); } + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + template + __hostdev__ const NodeT& node(uint32_t i) const { + NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); + const NodeT* ptr = nullptr; + if (DataT::mLinear) { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; + } else { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); + } + NANOVDB_ASSERT(ptr && isAligned(ptr)); + return *ptr; + } + + /// @brief Return the i'th node with respect to breadth-first ordering + template + __hostdev__ NodeT& node(uint32_t i) { + NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); + NodeT* ptr = nullptr; + if (DataT::mLinear) { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; + } else { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); + } + NANOVDB_ASSERT(ptr && isAligned(ptr)); + return *ptr; + } + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + __hostdev__ const Node0& leaf(uint32_t i) const { return this->node<0>(i); } + __hostdev__ Node0& leaf(uint32_t i) { return this->node<0>(i); } + + /// @brief Return the i'th lower internal node with respect to breadth-first ordering + __hostdev__ const Node1& lower(uint32_t i) const { return this->node<1>(i); } + __hostdev__ Node1& lower(uint32_t i) { return this->node<1>(i); } + + /// @brief Return the i'th upper internal node with respect to breadth-first ordering + __hostdev__ const Node2& upper(uint32_t i) const { return this->node<2>(i); } + __hostdev__ Node2& upper(uint32_t i) { return this->node<2>(i); } + +}; // NodeManager class + +template +NodeManagerHandle createNodeManager(const NanoGrid &grid, + const 
BufferT& buffer) +{ + NodeManagerHandle handle(toGridType(), BufferT::create(NodeManager::memUsage(grid), &buffer)); + auto *data = reinterpret_cast(handle.data()); + NANOVDB_ASSERT(data && isAligned(data)); + NANOVDB_ASSERT(toGridType() == grid.gridType()); +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + *data = NodeManagerData{NANOVDB_MAGIC_NODE, {0u}, (void*)&grid, {{0u,0u,0u}}}; +#else + *data = NodeManagerData{NANOVDB_MAGIC_NUMB, {0u}, (void*)&grid, {{0u,0u,0u}}}; +#endif + + if (NodeManager::isLinear(grid)) { + data->mLinear = uint8_t(1u); + data->mOff[0] = util::PtrDiff(grid.tree().template getFirstNode<0>(), &grid); + data->mOff[1] = util::PtrDiff(grid.tree().template getFirstNode<1>(), &grid); + data->mOff[2] = util::PtrDiff(grid.tree().template getFirstNode<2>(), &grid); + } else { + int64_t *ptr0 = data->mPtr[0] = reinterpret_cast(data + 1); + int64_t *ptr1 = data->mPtr[1] = data->mPtr[0] + grid.tree().nodeCount(0); + int64_t *ptr2 = data->mPtr[2] = data->mPtr[1] + grid.tree().nodeCount(1); + // Performs depth first traversal but breadth first insertion + for (auto it2 = grid.tree().root().cbeginChild(); it2; ++it2) { + *ptr2++ = util::PtrDiff(&*it2, &grid); + for (auto it1 = it2->cbeginChild(); it1; ++it1) { + *ptr1++ = util::PtrDiff(&*it1, &grid); + for (auto it0 = it1->cbeginChild(); it0; ++it0) { + *ptr0++ = util::PtrDiff(&*it0, &grid); + }// loop over child nodes of the lower internal node + }// loop over child nodes of the upper internal node + }// loop over child nodes of the root node + } + + return handle;// // is converted to r-value so return value is move constructed! +} + +} // namespace nanovdb + +#if defined(__CUDACC__) +#include +#endif// defined(__CUDACC__) + +#endif // NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/PNanoVDB.h b/nanovdb/nanovdb/PNanoVDB.h index 24fb68478c..40888f242c 100644 --- a/nanovdb/nanovdb/PNanoVDB.h +++ b/nanovdb/nanovdb/PNanoVDB.h @@ -3,7 +3,7 @@ // SPDX-License-Identifier: MPL-2.0 /*! - \file PNanoVDB.h + \file nanovdb/PNanoVDB.h \author Andrew Reidmeyer @@ -291,7 +291,11 @@ void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint byte_offset, uvec2 value // struct typedef, static const, inout #if defined(PNANOVDB_C) #define PNANOVDB_STRUCT_TYPEDEF(X) typedef struct X X; +#if defined(__CUDA_ARCH__) +#define PNANOVDB_STATIC_CONST constexpr __constant__ +#else #define PNANOVDB_STATIC_CONST static const +#endif #define PNANOVDB_INOUT(X) X* #define PNANOVDB_IN(X) const X* #define PNANOVDB_DEREF(X) (*X) @@ -929,7 +933,7 @@ PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_addr #define PNANOVDB_MAGIC_FILE 0x324244566f6e614eUL// "NanoVDB2" in hex - little endian (uint64_t) #define PNANOVDB_MAJOR_VERSION_NUMBER 32// reflects changes to the ABI -#define PNANOVDB_MINOR_VERSION_NUMBER 6// reflects changes to the API but not ABI +#define PNANOVDB_MINOR_VERSION_NUMBER 7// reflects changes to the API but not ABI #define PNANOVDB_PATCH_VERSION_NUMBER 0// reflects bug-fixes with no ABI or API changes #define PNANOVDB_GRID_TYPE_UNKNOWN 0 @@ -958,7 +962,8 @@ PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_addr #define PNANOVDB_GRID_TYPE_POINTINDEX 23 #define PNANOVDB_GRID_TYPE_VEC3U8 24 #define PNANOVDB_GRID_TYPE_VEC3U16 25 -#define PNANOVDB_GRID_TYPE_END 26 +#define PNANOVDB_GRID_TYPE_UINT8 26 +#define PNANOVDB_GRID_TYPE_END 27 #define PNANOVDB_GRID_CLASS_UNKNOWN 0 #define PNANOVDB_GRID_CLASS_LEVEL_SET 1 // narrow band level set, e.g. 
SDF
@@ -989,17 +994,17 @@ PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_addr
 // BuildType = Unknown, float, double, int16_t, int32_t, int64_t, Vec3f, Vec3d, Mask, ...
 // bit count of values in leaf nodes, i.e. 8*sizeof(*nanovdb::LeafNode::mValues) or zero if no values are stored
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_value_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 0, 16, 32, 1, 32, 4, 8, 16, 0, 128, 256, 0, 0, 0, 0, 16, 24, 48 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_value_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 0, 16, 32, 1, 32, 4, 8, 16, 0, 128, 256, 0, 0, 0, 0, 16, 24, 48, 8 };
 // bit count of the Tile union in InternalNodes, i.e. 8*sizeof(nanovdb::InternalData::Tile)
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_table_strides_bits[PNANOVDB_GRID_TYPE_END] = { 64, 64, 64, 64, 64, 64, 128, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 256, 64, 64, 64, 64, 64, 64, 64 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_table_strides_bits[PNANOVDB_GRID_TYPE_END] = { 64, 64, 64, 64, 64, 64, 128, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 256, 64, 64, 64, 64, 64, 64, 64, 64 };
 // bit count of min/max values, i.e. 8*sizeof(nanovdb::LeafData::mMinimum) or zero if no min/max exists
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 8, 16, 32, 8, 32, 32, 32, 32, 32, 128, 256, 64, 64, 64, 64, 64, 24, 48 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 8, 16, 32, 8, 32, 32, 32, 32, 32, 128, 256, 64, 64, 64, 64, 64, 24, 48, 8 };
 // bit alignment of the value type, controlled by the smallest native type, which is why it is always 0, 8, 16, 32, or 64, e.g. for Vec3f it is 32
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_aligns_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 32, 64, 8, 16, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 8, 16 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_aligns_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 32, 64, 8, 16, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 8, 16, 8 };
 // bit alignment of the stats (avg/std-dev) types, e.g. 8*sizeof(nanovdb::LeafData::mAverage)
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_stat_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 32, 32, 64, 32, 64, 8, 32, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 32, 32 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_stat_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 32, 32, 64, 32, 64, 8, 32, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 32, 32, 32 };
 // one of the 4 leaf types defined above, e.g. PNANOVDB_LEAF_TYPE_INDEX = 3
-PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_leaf_type[PNANOVDB_GRID_TYPE_END] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2, 2, 0, 0, 3, 3, 4, 4, 5, 0, 0 };
+PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_leaf_type[PNANOVDB_GRID_TYPE_END] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2, 2, 0, 0, 3, 3, 4, 4, 5, 0, 0, 0 };
 
 struct pnanovdb_map_t
 {
@@ -1229,9 +1234,9 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_patch(pnanovdb_uint
 struct pnanovdb_gridblindmetadata_t
 {
-    pnanovdb_int64_t byte_offset; // 8 bytes, 0
-    pnanovdb_uint64_t element_count; // 8 bytes, 8
-    pnanovdb_uint32_t flags; // 4 bytes, 16
+    pnanovdb_int64_t data_offset; // 8 bytes, 0
+    pnanovdb_uint64_t value_count; // 8 bytes, 8
+    pnanovdb_uint32_t value_size; // 4 bytes, 16
     pnanovdb_uint32_t semantic; // 4 bytes, 20
     pnanovdb_uint32_t data_class; // 4 bytes, 24
     pnanovdb_uint32_t data_type; // 4 bytes, 28
@@ -1243,22 +1248,22 @@ PNANOVDB_STRUCT_TYPEDEF(pnanovdb_gridblindmetadata_handle_t)
 #define PNANOVDB_GRIDBLINDMETADATA_SIZE 288
 
-#define PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET 0
-#define PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT 8
-#define PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS 16
+#define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET 0
+#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT 8
+#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE 16
 #define PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC 20
 #define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS 24
 #define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE 28
 #define PNANOVDB_GRIDBLINDMETADATA_OFF_NAME 32
 
-PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_gridblindmetadata_get_byte_offset(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
-    return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET));
+PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_gridblindmetadata_get_data_offset(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
+    return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET));
 }
-PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_gridblindmetadata_get_element_count(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
-    return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT));
+PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_gridblindmetadata_get_value_count(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
+    return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT));
 }
-PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_flags(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
-    return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS));
+PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_value_size(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
+    return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE));
 }
 PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_semantic(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) {
     return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC));
@@ -1662,6 +1667,7 @@ PNANOVDB_STATIC_CONST pnanovdb_grid_type_constants_t pnanovdb_grid_type_constant
 {32, 40, 48, 56, 64, 96, 16, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 96, 96, 1120},
 {28, 31, 34, 40, 44, 64, 24, 8, 20, 32, 8224, 8227, 8232, 8236, 8256, 270400, 1056, 1059, 1064, 1068, 1088, 33856, 80, 83, 88, 92, 96, 1632},
 {28, 34, 40, 48, 52, 64, 48, 8, 20, 32, 8224, 8230, 8236, 8240, 8256, 270400, 1056, 1062, 1068, 1072, 1088, 33856, 80, 86, 92, 96, 128, 3200},
+{28, 29, 30, 32, 36, 64, 8, 8, 20, 32, 8224, 8225, 8228, 8232, 8256, 270400, 1056, 1057, 1060, 1064, 1088, 33856, 80, 81, 84, 88, 96, 608},
 };
 
 // ------------------------------------------------ Basic Lookup -----------------------------------------------------------
@@ -1678,7 +1684,7 @@ PNANOVDB_FORCE_INLINE pnanovdb_gridblindmetadata_handle_t pnanovdb_grid_get_grid
 PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_grid_get_gridblindmetadata_value_address(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, pnanovdb_uint32_t index)
 {
     pnanovdb_gridblindmetadata_handle_t meta = pnanovdb_grid_get_gridblindmetadata(buf, grid, index);
-    pnanovdb_int64_t byte_offset = pnanovdb_gridblindmetadata_get_byte_offset(buf, meta);
+    pnanovdb_int64_t byte_offset = pnanovdb_gridblindmetadata_get_data_offset(buf, meta);
     pnanovdb_address_t address = pnanovdb_address_offset64(meta.address, pnanovdb_int64_as_uint64(byte_offset));
     return address;
 }
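Editor's note for PNanoVDB consumers: blind-data arrays are now described by data_offset/value_count/value_size instead of byte_offset/element_count/flags. A minimal sketch of how the renamed accessors compose, assuming buf, grid, and the index i come from the caller:

```c
/* Sketch: size and locate the i-th blind-data array via the renamed fields. */
pnanovdb_gridblindmetadata_handle_t meta = pnanovdb_grid_get_gridblindmetadata(buf, grid, i);
pnanovdb_uint64_t count = pnanovdb_gridblindmetadata_get_value_count(buf, meta); /* number of values */
pnanovdb_uint32_t size  = pnanovdb_gridblindmetadata_get_value_size(buf, meta);  /* bytes per value  */
pnanovdb_address_t data = pnanovdb_grid_get_gridblindmetadata_value_address(buf, grid, i);
/* the payload occupies count * size bytes starting at 'data' */
```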
diff --git a/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc b/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc
index 7a3a5b5170..9133bd7f8c 100644
--- a/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc
+++ b/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc
@@ -15,9 +15,9 @@
 #include
 #include
-#include <nanovdb/util/IO.h> // this is required to read (and write) NanoVDB files on the host
-#include <nanovdb/util/CreateNanoGrid.h>
-#include <nanovdb/util/NanoToOpenVDB.h>
+#include <nanovdb/io/IO.h> // this is required to read (and write) NanoVDB files on the host
+#include <nanovdb/tools/CreateNanoGrid.h>
+#include <nanovdb/tools/NanoToOpenVDB.h>
 
 void usage [[noreturn]] (const std::string& progName, int exitStatus = EXIT_FAILURE)
 {
@@ -47,7 +49,9 @@ void usage [[noreturn]] (const std::string& progName, int exitStatus = EXIT_FAIL
 void version [[noreturn]] (const char* progName, int exitStatus = EXIT_SUCCESS)
 {
-    printf("\n%s was build against NanoVDB version %s\n", progName, nanovdb::Version().c_str());
+    char str[8];
+    nanovdb::toStr(str, nanovdb::Version());
+    printf("\n%s was built against NanoVDB version %s\n", progName, str);
     exit(exitStatus);
 }
 
@@ -56,8 +58,8 @@ int main(int argc, char* argv[])
     int exitStatus = EXIT_SUCCESS;
 
     nanovdb::io::Codec codec = nanovdb::io::Codec::NONE;// compression codec for the file
-    nanovdb::StatsMode sMode = nanovdb::StatsMode::Default;
-    nanovdb::ChecksumMode cMode = nanovdb::ChecksumMode::Default;
+    nanovdb::tools::StatsMode sMode = nanovdb::tools::StatsMode::Default;
+    nanovdb::CheckMode cMode = nanovdb::CheckMode::Default;
     nanovdb::GridType qMode = nanovdb::GridType::Unknown;//specify the quantization mode
     bool verbose = false, overwrite = false, dither = false, absolute = true;
    float tolerance = -1.0f;
@@ -99,11 +101,11 @@ int main(int argc, char* argv[])
             std::string str(argv[++i]);
             toLowerCase(str);
             if (str == "none") {
-                cMode = nanovdb::ChecksumMode::Disable;
+                cMode = nanovdb::CheckMode::Disable;
             } else if (str == "partial") {
-                cMode = nanovdb::ChecksumMode::Partial;
+                cMode = nanovdb::CheckMode::Partial;
             } else if (str == "full") {
-                cMode = nanovdb::ChecksumMode::Full;
+                cMode = nanovdb::CheckMode::Full;
             } else {
                 std::cerr << "Expected one of the following checksum modes: {none, partial, full}\n" << std::endl;
                 usage(argv[0]);
@@ -117,13 +119,13 @@ int main(int argc, char* argv[])
             std::string str(argv[++i]);
             toLowerCase(str);
             if (str == "none") {
-                sMode = nanovdb::StatsMode::Disable;
+                sMode = nanovdb::tools::StatsMode::Disable;
             } else if (str == "bbox") {
-                sMode = nanovdb::StatsMode::BBox;
+                sMode = nanovdb::tools::StatsMode::BBox;
             } else if (str == "extrema") {
-                sMode = nanovdb::StatsMode::MinMax;
+                sMode = nanovdb::tools::StatsMode::MinMax;
             } else if (str == "all") {
-                sMode = nanovdb::StatsMode::All;
+                sMode = nanovdb::tools::StatsMode::All;
             } else {
                 std::cerr << "Expected one of the following stats modes: {none, bbox, extrema, all}\n" << std::endl;
                 usage(argv[0]);
@@ -136,7 +138,7 @@ int main(int argc, char* argv[])
         } else {
             qMode = nanovdb::GridType::FpN;
             absolute = true;
-            tolerance = atof(argv[++i]);
+            tolerance = static_cast<float>(atof(argv[++i]));
         }
     } else if (arg == "-r" || arg == "--rel-error") {
         if (i + 1 == argc) {
@@ -145,7 +147,7 @@ int main(int argc, char* argv[])
         } else {
             qMode = nanovdb::GridType::FpN;
             absolute = false;
-            tolerance = atof(argv[++i]);
+            tolerance = static_cast<float>(atof(argv[++i]));
         }
     } else if (arg == "-g" || arg == "--grid") {
         if (i + 1 == argc) {
@@ -203,7 +205,7 @@ int main(int argc, char* argv[])
     {
         using SrcGridT = openvdb::FloatGrid;
         if (auto floatGrid = openvdb::GridBase::grid<SrcGridT>(base)) {
-            nanovdb::CreateNanoGrid<SrcGridT> s(*floatGrid);
+            nanovdb::tools::CreateNanoGrid<SrcGridT> s(*floatGrid);
             s.setStats(sMode);
             s.setChecksum(cMode);
             s.enableDithering(dither);
@@ -217,15 +219,15 @@ int main(int argc, char* argv[])
                 return s.getHandle();
             case nanovdb::GridType::FpN:
                 if (absolute) {
-                    return s.getHandle(nanovdb::AbsDiff(tolerance));
+                    return s.getHandle(nanovdb::tools::AbsDiff(tolerance));
                 } else {
-                    return s.getHandle(nanovdb::RelDiff(tolerance));
+                    return s.getHandle(nanovdb::tools::RelDiff(tolerance));
                 }
             default: break;
             }// end of switch
         }
-        return nanovdb::openToNanoVDB(base, sMode, cMode, verbose ? 1 : 0);
+        return nanovdb::tools::openToNanoVDB(base, sMode, cMode, verbose ? 1 : 0);
     };
     try {
         if (toNanoVDB) { // OpenVDB -> NanoVDB
@@ -275,7 +277,7 @@ int main(int argc, char* argv[])
             for (uint32_t i = 0; i < h.gridCount(); ++i) {
                 if (verbose)
                     std::cout << "Converting NanoVDB grid named \"" << h.gridMetaData(i)->shortGridName() << "\" to OpenVDB" << std::endl;
-                grids->push_back(nanoToOpenVDB(h, 0, i));
+                grids->push_back(nanovdb::tools::nanoToOpenVDB(h, 0, i));
             }
         }
     } else {
@@ -286,7 +288,7 @@ int main(int argc, char* argv[])
         }
         if (verbose)
             std::cout << "Converting NanoVDB grid named \"" << handle.gridMetaData()->shortGridName() << "\" to OpenVDB" << std::endl;
-        grids->push_back(nanoToOpenVDB(handle));
+        grids->push_back(nanovdb::tools::nanoToOpenVDB(handle));
     }
 } // loop over input files
 file.write(*grids);
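Outside the CLI the same relocated API is available directly. A minimal sketch, assuming an existing openvdb::FloatGrid shared pointer named floatGrid and mirroring the calls the tool makes above (the 0.01f tolerance is illustrative):

```cpp
#include <nanovdb/tools/CreateNanoGrid.h>

// Sketch: OpenVDB -> NanoVDB with stats, checksum, and FpN quantization.
nanovdb::tools::CreateNanoGrid<openvdb::FloatGrid> s(*floatGrid);
s.setStats(nanovdb::tools::StatsMode::All);
s.setChecksum(nanovdb::CheckMode::Full);
auto handle = s.getHandle(nanovdb::tools::AbsDiff(0.01f)); // absolute error tolerance
```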
diff --git a/nanovdb/nanovdb/cmd/print/nanovdb_print.cc b/nanovdb/nanovdb/cmd/print/nanovdb_print.cc
index 5336a07190..dd091e75a7 100644
--- a/nanovdb/nanovdb/cmd/print/nanovdb_print.cc
+++ b/nanovdb/nanovdb/cmd/print/nanovdb_print.cc
@@ -11,7 +11,7 @@
     \brief Command-line tool that prints information about grids in a nanovdb file
 */
 
-#include <nanovdb/util/IO.h> // this is required to read (and write) NanoVDB files on the host
+#include <nanovdb/io/IO.h> // this is required to read (and write) NanoVDB files on the host
 #include
 #include
@@ -31,7 +31,9 @@
 void version [[noreturn]] (const char* progName, int exitStatus = EXIT_SUCCESS)
 {
-    printf("\n%s was build against NanoVDB version %s\n", progName, nanovdb::Version().c_str());
+    char str[8];
+    nanovdb::toStr(str, nanovdb::Version());
+    printf("\n%s was built against NanoVDB version %s\n", progName, str);
     exit(exitStatus);
 }
 
@@ -42,6 +44,7 @@ int main(int argc, char* argv[])
     enum Mode : int { Short = 0,
                       Default = 1,
                       Long = 2 } mode = Default;
+    char str[32];
     bool verbose = false;
     std::string gridName;
     std::vector<std::string> fileNames;
@@ -109,7 +112,7 @@ int main(int argc, char* argv[])
         ss << "(" << v[0] << "," << v[1] << "," << v[2] << ")";
         return ss.str();
     };
-    auto wbboxToStr = [](const nanovdb::BBox<nanovdb::Vec3d>& bbox) {
+    auto wbboxToStr = [](const nanovdb::math::BBox<nanovdb::Vec3d>& bbox) {
         std::stringstream ss;
         if (bbox.empty()) {
             ss << "empty grid";
@@ -174,15 +177,15 @@ int main(int argc, char* argv[])
     auto resWidth = std::string("Resolution").length() + padding;
     for (auto& m : list) {
         width(nameWidth, m.gridName);
-        width(typeWidth, nanovdb::toStr(m.gridType));
-        width(classWidth, nanovdb::toStr(m.gridClass));
-        width(codecWidth, nanovdb::io::toStr(m.codec));
+        width(typeWidth, nanovdb::toStr(str, m.gridType));
+        width(classWidth, nanovdb::toStr(str, m.gridClass));
+        width(codecWidth, nanovdb::io::toStr(str, m.codec));
         width(wbboxWidth, wbboxToStr(m.worldBBox));
         width(ibboxWidth, ibboxToStr(m.indexBBox));
         width(resWidth, resToStr(m.indexBBox));
         width(sizeWidth, format(m.gridSize));
         width(fileWidth, format(m.fileSize));
-        width(versionWidth, std::string(m.version.c_str()));
+        width(versionWidth, nanovdb::toStr(str, m.version));
         width(configWidth, nodesToStr(m.nodeCount));
         width(tileWidth, nodesToStr(m.tileCount));
         width(voxelsWidth, std::to_string(m.voxelCount));
@@ -220,11 +223,11 @@ int main(int argc, char* argv[])
             continue;
         std::cout << std::left << std::setw(numberWidth) << ++n
                   << std::left << std::setw(nameWidth) << m.gridName
-                  << std::left << std::setw(typeWidth) << nanovdb::toStr(m.gridType);
+                  << std::left << std::setw(typeWidth) << nanovdb::toStr(str, m.gridType);
         if (mode != Short) {
-            std::cout << std::left << std::setw(classWidth) << nanovdb::toStr(m.gridClass)
-                      << std::left << std::setw(versionWidth) << std::string(m.version.c_str())
-                      << std::left << std::setw(codecWidth) << nanovdb::io::toStr(m.codec)
+            std::cout << std::left << std::setw(classWidth) << nanovdb::toStr(str, m.gridClass)
+                      << std::left << std::setw(versionWidth) << nanovdb::toStr(str + 10, m.version)
+                      << std::left << std::setw(codecWidth) << nanovdb::io::toStr(str + 20, m.codec)
                       << std::left << std::setw(sizeWidth) << format(m.gridSize)
                       << std::left << std::setw(fileWidth) << format(m.fileSize)
                       << std::left << std::setw(voxelSizeWidth) << Vec3dToStr(m.voxelSize);
@@ -321,4 +324,4 @@ int main(int argc, char* argv[])
     }
     return exitStatus;
-}
+}// main
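The pattern behind the char str[32] scratch array above: the 32.7 toStr overloads no longer allocate; they write into caller-provided storage and return it, so disjoint offsets of one buffer can hold several short strings at once (str, str + 10, str + 20 in the printer). A sketch under that assumption, with handle obtained elsewhere:

```cpp
// Sketch: caller-owned buffer for the non-allocating toStr overloads.
char str[32];
std::cout << nanovdb::toStr(str, handle.gridType()) << std::endl;
```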
diff --git a/nanovdb/nanovdb/cmd/updateFiles.py b/nanovdb/nanovdb/cmd/updateFiles.py
new file mode 100644
index 0000000000..e4041c91f6
--- /dev/null
+++ b/nanovdb/nanovdb/cmd/updateFiles.py
@@ -0,0 +1,220 @@
+import argparse
+import os
+from pathlib import Path
+
+
+def open_file(file_path):
+    """
+    Opens a file. If utf-8 decoding fails, try windows-1252.
+
+    Args:
+        file_path: Path of the file to open.
+
+    Returns:
+        The content of the file as a string.
+    """
+    try:
+        with open(file_path, "r", encoding="utf-8", errors="replace") as file:
+            return file.read()
+    except UnicodeDecodeError:
+        with open(file_path, "r", encoding="windows-1252", errors="replace") as file:
+            return file.read()
+
+
+def write_file(file_path, content):
+    """
+    Writes a file. If utf-8 encoding fails, try windows-1252.
+
+    Args:
+        file_path: Path of the file to write.
+        content: The content to write.
+
+    Returns:
+        None.
+    """
+    try:
+        with open(file_path, "w", encoding="utf-8", errors="replace") as file:
+            file.write(content)
+    except UnicodeDecodeError:
+        with open(file_path, "w", encoding="windows-1252", errors="replace") as file:
+            file.write(content)
+
+
+def update_files(dir_path):
+    """
+    Updates the content of files ending in .h, .cuh, .cc, .cu, and .cpp
+    to call the appropriate API as we update NanoVDB from version 32.6 to
+    version 32.7. This includes changes in namespaces, function names, and
+    include directories.
+
+    Args:
+        dir_path: Directory to process; files in downstream directories are included.
+
+    Returns:
+        None. Writes the new contents of each file in place.
+    """
+
+    # List of file extensions to search for
+    file_extensions = [".h", ".cuh", ".cc", ".cu", ".cpp"]
+
+    nspace_dic = {
+        "math": [
+            "Ray",
+            "DDA<",
+            "HDDA",
+            "Vec3<",
+            "Vec4<",
+            "BBox<",
+            "ZeroCrossing",
+            "TreeMarcher",
+            "PointTreeMarcher",
+            "BoxStencil<",
+            "CurvatureStencil<",
+            "GradStencil<",
+            "WenoStencil<",
+            "AlignUp",
+            "Min",
+            "Max",
+            "Abs",
+            "Clamp",
+            "Sqrt",
+            "Sign",
+            "Maximum<",
+            "Delta<",
+            "RoundDown<",
+            "pi<",
+            "isApproxZero<",
+            "Round<",
+            "createSampler",
+            "SampleFromVoxels<",
+        ],
+        "tools": [
+            "createNanoGrid",
+            "StatsMode",
+            "createLevelSetSphere",
+            "createFogVolumeSphere",
+            "createFogVolumeTorus",
+            "createLevelSetBox",
+            "CreateNanoGrid",
+            "updateGridStats",
+            "evalChecksum",
+            "validateChecksum",
+            "checkGrid",
+            "Extrema",
+        ],
+        "util": [
+            "is_floating_point",
+            "findLowestOn",
+            "findHighestOn",
+            "Range",
+            "streq",
+            "strcpy",
+            "strcat",
+            "empty(",
+            "Split",
+            "invoke",
+            "forEach",
+            "reduce",
+            "prefixSum",
+            "is_same",
+            "is_specialization",
+            "PtrAdd",
+            "PtrDiff",
+        ],
+    }
+
+    rename_dic = {
+        # list from func4 in updateFiles.sh
+        "nanovdb::build::": "nanovdb::tools::build::",
+        "nanovdb::BBoxR": "nanovdb::Vec3dBBox",
+        "nanovdb::BBox": "nanovdb::Vec3dBbox",
+        # scope and rename, i.e. list from func2 in updateFiles.sh
+        "nanovdb::cudaCreateNodeManager": "nanovdb::cuda::createNodeManager",
+        "nanovdb::cudaVoxelsToGrid": "nanovdb::cuda::voxelsToGrid",
+        "nanovdb::cudaPointsToGrid": "nanovdb::cuda::pointsToGrid",
+        "nanovdb::DitherLUT": "nanovdb::math::DitherLUT",
+        "nanovdb::PackedRGBA8": "nanovdb::math::Rgba8",
+        "nanovdb::Rgba8": "nanovdb::math::Rgba8",
+        "nanovdb::CpuTimer": "nanovdb::util::Timer",
+        "nanovdb::GpuTimer": "nanovdb::util::cuda::Timer",
+        "nanovdb::CountOn": "nanovdb::util::countOn",
+    }
+
+    movdir_dic = {
+        # list comes from func3 calls on updateFiles.sh
+        "util/GridHandle.h": "GridHandle.h",
+        "util/BuildGrid.h": "tools/GridBuilder.h",
+        "util/GridBuilder.h": "tools/GridBuilder.h",
+        "util/IO.h": "io/IO.h",
+        "util/CSampleFromVoxels.h": "math/CSampleFromVoxels.h",
+        "util/DitherLUT.h": "math/DitherLUT.h",
+        "util/HDDA.h": "math/HDDA.h",
+        "util/Ray.h": "math/Ray.h",
+        "util/SampleFromVoxels.h": "math/SampleFromVoxels.h",
+        "util/Stencils.h": "math/Stencils.h",
+        "util/CreateNanoGrid.h": "tools/CreateNanoGrid.h",
+        "util/Primitives.h": "tools/CreatePrimitives.h",
+        "util/GridChecksum.h": "tools/GridChecksum.h",
+        "util/GridStats.h": "tools/GridStats.h",
+        "util/GridValidator.h": "tools/GridValidator.h",
+        "util/NanoToOpenVDB.h": "tools/NanoToOpenVDB.h",
+        "util/cuda/CudaGridChecksum.cuh": "tools/cuda/CudaGridChecksum.cuh",
+        "util/cuda/CudaGridStats.cuh": "tools/cuda/CudaGridStats.cuh",
+        "util/cuda/CudaGridValidator.cuh": "tools/cuda/CudaGridValidator.cuh",
+        "util/cuda/CudaIndexToGrid.cuh": "tools/cuda/CudaIndexToGrid.cuh",
+        "util/cuda/CudaPointsToGrid.cuh": "tools/cuda/PointsToGrid.cuh",
+        "util/cuda/CudaSignedFloodFill.cuh": "tools/cuda/CudaSignedFloodFill.cuh",
+        "util/cuda/CudaDeviceBuffer.h": "cuda/DeviceBuffer.h",
+        "util/cuda/CudaGridHandle.cuh": "cuda/GridHandle.cuh",
+        "util/cuda/CudaUtils.h": "util/cuda/Util.h",
+        "util/cuda/GpuTimer.h": "util/cuda/Timer.h",
+    }
+
+    # Iterate over files in the directory and its subdirectories
+    for root, dirs, files in os.walk(dir_path):
+        for file in files:
+            if any(file.endswith(ext) for ext in file_extensions):
+                file_path = os.path.join(root, file)
+                print(f"Processing file: {file_path}")
+
+                content = open_file(file_path)
+
+                # Correspond to func1 $file in updateFiles.sh
+                for key, vals in nspace_dic.items():
+                    for val in vals:
+                        old_word = "nanovdb::" + val
+                        new_word = "nanovdb::" + key + "::" + val
+                        content = content.replace(old_word, new_word)
+
+                # Correspond to func4 and func2 in updateFiles.sh
+                for key, val in rename_dic.items():
+                    content = content.replace(key, val)
+
+                # Correspond to func3 in updateFiles.sh
+                for key, val in movdir_dic.items():
+                    old_path = "<nanovdb/" + key + ">"
+                    new_path = "<nanovdb/" + val + ">"
+                    content = content.replace(old_path, new_path)
+
+                write_file(file_path, content)
+
+# Example use:
+# To update all the files using NanoVDB in the current directory (and directories downstream):
+# python ./nanovdb/nanovdb/cmd/updateFiles.py
+# To update all the files using NanoVDB in a directory called foo (and directories downstream):
+# python ./nanovdb/nanovdb/cmd/updateFiles.py -d /path/to/foo
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Update files using NanoVDB to the 32.7 API (namespaces, renames, and include paths).")
+    parser.add_argument(
+        "-d",
+        "--directory",
+        type=str,
+        default=None,
+        help="Path to directory containing .h, .cc, and .cu files using NanoVDB.",
+    )
+
+    args = parser.parse_args()
+    dir_path = os.getcwd() if args.directory is None else Path(args.directory).resolve()
+
+    update_files(dir_path)
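The three passes can be sanity-checked on a one-line sample before running the script over a whole tree; the snippet below exercises one illustrative mapping from each of the tables above:

```python
# Sketch: what one line of user code looks like before and after the passes.
line = '#include <nanovdb/util/IO.h>  // uses nanovdb::createNanoGrid'
line = line.replace("nanovdb::createNanoGrid", "nanovdb::tools::createNanoGrid")  # nspace_dic pass
line = line.replace("<nanovdb/util/IO.h>", "<nanovdb/io/IO.h>")                   # movdir_dic pass
assert line == '#include <nanovdb/io/IO.h>  // uses nanovdb::tools::createNanoGrid'
```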
diff --git a/nanovdb/nanovdb/cmd/updateFiles.sh b/nanovdb/nanovdb/cmd/updateFiles.sh
new file mode 100755
index 0000000000..87613c3a44
--- /dev/null
+++ b/nanovdb/nanovdb/cmd/updateFiles.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Usage: process all files in this directory, or optionally specify a target directory
+
+# Define directory in which to find files
+dir="."
+if [ "$1" ]; then
+    dir="$1"
+fi
+
+# Check if dir is not a directory
+if [ ! -d "$dir" ]; then
+    echo -e "\nUsage: '$0 <directory>'\n"
+    exit 1
+fi
+
+# E.g.: func1 $file "math" "Coord" "Vec3" "Vec4"
+func1 () {
+    for ((i=3; i<=$#; i++)); do
+        arg="s/nanovdb::${!i}/nanovdb::$2::${!i}/g"
+        #echo "sed -i $arg $1"
+        sed -i $arg $1
+    done
+}
+
+# E.G.: func2 file namespace old new : nanovdb::old -> nanovdb::namespace::new in file
+func2 () {
+    arg="s/nanovdb::$3/nanovdb::$2::$4/g"
+    #echo "sed -i $arg $1"
+    sed -i $arg $1
+}
+
+# E.G.: func3 file path1/old.h path2/new.h : <nanovdb/path1/old.h> -> <nanovdb/path2/new.h> in file
+func3 () {
+    arg="s;<nanovdb/$2>;<nanovdb/$3>;g"
+    #echo "sed -i $arg $1"
+    sed -i $arg $1
+}
+
+# E.g.: func4 file old new : old -> new in file
+func4 () {
+    arg="s;$2;$3;g"
+    #echo "sed -i $arg $1"
+    sed -i $arg $1
+}
+
+# Loop through files in the target directory
+for file in $(find "$dir" -name '*.h' -or -name '*.cuh' -or -name '*.cc' -or -name '*.cu' -or -name '*.cpp'); do
+    if [ -f "$file" ]; then
+        echo "Processing file: $file"
+        func1 $file math Ray "DDA<" HDDA "Vec3<" "Vec4<" "BBox<" ZeroCrossing TreeMarcher PointTreeMarcher\
+            "BoxStencil<" "CurvatureStencil<" "GradStencil<" "WenoStencil<" AlignUp Min Max Abs Clamp\
+            Sqrt Sign "Maximum<" "Delta<" "RoundDown<" "pi<" "isApproxZero<" "Round<" createSampler "SampleFromVoxels<"
+        func1 $file tools createNanoGrid StatsMode createLevelSetSphere createFogVolumeSphere\
+            createFogVolumeTorus createLevelSetBox CreateNanoGrid updateGridStats\
+            evalChecksum validateChecksum checkGrid Extrema
+        func1 $file util is_floating_point findLowestOn findHighestOn Range streq strcpy strcat "empty("\
+            Split invoke forEach reduce prefixSum is_same is_specialization PtrAdd PtrDiff
+        func4 $file "nanovdb::build::" "nanovdb::tools::build::"
+        func4 $file "nanovdb::BBoxR" "nanovdb::Vec3dBBox"
+        func4 $file "nanovdb::BBox" "nanovdb::Vec3dBbox"
+        func2 $file cuda cudaCreateNodeManager createNodeManager
+        func2 $file cuda cudaVoxelsToGrid voxelsToGrid
+        func2 $file cuda cudaPointsToGrid pointsToGrid
+        func2 $file math DitherLUT DitherLUT
+        func2 $file math PackedRGBA8 Rgba8
+        func2 $file math Rgba8 Rgba8
+        func2 $file util CpuTimer Timer
+        func2 $file util GpuTimer "cuda::Timer"
+        func2 $file util CountOn countOn
+        func3 $file "util/GridHandle.h" "GridHandle.h"
+        func3 $file "util/BuildGrid.h" "tools/GridBuilder.h"
+        func3 $file "util/GridBuilder.h" "tools/GridBuilder.h"
+        func3 $file "util/IO.h" "io/IO.h"
+        func3 $file "util/CSampleFromVoxels.h" "math/CSampleFromVoxels.h"
+        func3 $file "util/DitherLUT.h" "math/DitherLUT.h"
+        func3 $file "util/HDDA.h" "math/HDDA.h"
+        func3 $file "util/Ray.h" "math/Ray.h"
+        func3 $file "util/SampleFromVoxels.h" "math/SampleFromVoxels.h"
+        func3 $file "util/Stencils.h" "math/Stencils.h"
+        func3 $file "util/CreateNanoGrid.h" "tools/CreateNanoGrid.h"
+        func3 $file "util/Primitives.h" "tools/CreatePrimitives.h"
+        func3 $file "util/GridChecksum.h" "tools/GridChecksum.h"
+        func3 $file "util/GridStats.h" "tools/GridStats.h"
+        func3 $file "util/GridValidator.h" "tools/GridValidator.h"
+        func3 $file "util/NanoToOpenVDB.h" "tools/NanoToOpenVDB.h"
+        func3 $file "util/cuda/CudaGridChecksum.cuh" "tools/cuda/CudaGridChecksum.cuh"
+        func3 $file "util/cuda/CudaGridStats.cuh" "tools/cuda/CudaGridStats.cuh"
+        func3 $file "util/cuda/CudaGridValidator.cuh" "tools/cuda/CudaGridValidator.cuh"
+        func3 $file "util/cuda/CudaIndexToGrid.cuh" "tools/cuda/CudaIndexToGrid.cuh"
+        func3 $file "util/cuda/CudaPointsToGrid.cuh" "tools/cuda/PointsToGrid.cuh"
+        func3 $file "util/cuda/CudaSignedFloodFill.cuh" "tools/cuda/CudaSignedFloodFill.cuh"
+        func3 $file "util/cuda/CudaDeviceBuffer.h" "cuda/DeviceBuffer.h"
+        func3 $file "util/cuda/CudaGridHandle.cuh" "cuda/GridHandle.cuh"
+        func3 $file "util/cuda/CudaUtils.h" "util/cuda/Util.h"
+        func3 $file "util/cuda/GpuTimer.h" "util/cuda/Timer.h"
+    fi
+done
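Typical invocation of the shell variant (the target path is illustrative). Note that sed -i rewrites files in place, so run it on a checkout you can diff afterwards:

```bash
./nanovdb/nanovdb/cmd/updateFiles.sh ~/dev/my_project/src
```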
"tools/GridValidator.h" + func3 $file "util/NanoToOpenVDB.h" "tools/NanoToOpenVDB.h" + func3 $file "util/cuda/CudaGridChecksum.cuh" "tools/cuda/CudaGridChecksum.cuh" + func3 $file "util/cuda/CudaGridStats.cuh" "tools/cuda/CudaGridStats.cuh" + func3 $file "util/cuda/CudaGridValidator.cuh" "tools/cuda/CudaGridValidator.cuh" + func3 $file "util/cuda/CudaIndexToGrid.cuh" "tools/cuda/CudaIndexToGrid.cuh" + func3 $file "util/cuda/CudaPointsToGrid.cuh" "tools/cuda/PointsToGrid.cuh" + func3 $file "util/cuda/CudaSignedFloodFill.cuh" "tools/cuda/CudaSignedFloodFill.cuh" + func3 $file "util/cuda/CudaDeviceBuffer.h" "cuda/DeviceBuffer.h" + func3 $file "util/cuda/CudaGridHandle.cuh" "cuda/GridHandle.cuh" + func3 $file "util/cuda/CudaUtils.h" "util/cuda/Util.h" + func3 $file "util/cuda/GpuTimer.h" "util/cuda/Timer.h" + fi +done diff --git a/nanovdb/nanovdb/cmd/validate/nanovdb_validate.cc b/nanovdb/nanovdb/cmd/validate/nanovdb_validate.cc index faec25aa4d..2d563d92de 100644 --- a/nanovdb/nanovdb/cmd/validate/nanovdb_validate.cc +++ b/nanovdb/nanovdb/cmd/validate/nanovdb_validate.cc @@ -11,8 +11,8 @@ \brief Command-line tool that validates Grids in nanovdb files */ -#include // this is required to read (and write) NanoVDB files on the host -#include +#include // this is required to read (and write) NanoVDB files on the host +#include #include #include @@ -23,6 +23,7 @@ void usage [[noreturn]] (const std::string& progName, int exitStatus = EXIT_FAIL << "Options:\n" << "-g,--grid name\tOnly validate grids matching the specified string name\n" << "-h,--help\tPrints this message\n" + << "-p,--partial\tPerform partial (i.e. fast) validation tests\n" << "-v,--verbose\tPrint verbose information information useful for debugging\n" << "--version\tPrint version information to the terminal\n"; exit(exitStatus); @@ -30,17 +31,18 @@ void usage [[noreturn]] (const std::string& progName, int exitStatus = EXIT_FAIL void version [[noreturn]] (const char* progName, int exitStatus = EXIT_SUCCESS) { - printf("\n%s was build against NanoVDB version %s\n", progName, nanovdb::Version().c_str()); + char str[8]; + nanovdb::toStr(str, nanovdb::Version()); + printf("\n%s was build against NanoVDB version %s\n", progName, str); exit(exitStatus); } int main(int argc, char* argv[]) { - int exitStatus = EXIT_SUCCESS; - - bool verbose = false; - bool detailed = true; - std::string gridName; + int exitStatus = EXIT_SUCCESS; + bool verbose = false; + nanovdb::CheckMode mode = nanovdb::CheckMode::Full; + std::string gridName; std::vector fileNames; for (int i = 1; i < argc; ++i) { std::string arg = argv[i]; @@ -51,6 +53,8 @@ int main(int argc, char* argv[]) version(argv[0]); } else if (arg == "-v" || arg == "--verbose") { verbose = true; + } else if (arg == "-p" || arg == "--partial") { + mode = nanovdb::CheckMode::Partial; } else if (arg == "-g" || arg == "--grid") { if (i + 1 == argc) { std::cerr << "\nExpected a grid name to follow the -g,--grid option\n"; @@ -79,62 +83,17 @@ int main(int argc, char* argv[]) if (!gridName.empty()) { std::vector tmp; for (auto& m : list) { - if (nameKey == m.nameKey && gridName == m.gridName) - tmp.emplace_back(m); + if (nameKey == m.nameKey && gridName == m.gridName) tmp.emplace_back(m); } - list = tmp; - } - if (list.size() == 0) { - continue; + list = std::move(tmp); } + if (list.size() == 0) continue; - if (verbose) { - std::cout << "\nThe file \"" << file << "\" contains the following matching " << list.size() << " grid(s):\n"; - } + if (verbose) std::cout << "\nThe file \"" << file << "\" contains 
diff --git a/nanovdb/nanovdb/cuda/DeviceBuffer.h b/nanovdb/nanovdb/cuda/DeviceBuffer.h
new file mode 100644
index 0000000000..171235afbc
--- /dev/null
+++ b/nanovdb/nanovdb/cuda/DeviceBuffer.h
@@ -0,0 +1,231 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file DeviceBuffer.h
+
+    \author Ken Museth
+
+    \date January 8, 2020
+
+    \brief Implements a simple dual (host/device) CUDA buffer.
+
+    \note This file has no device-only kernel functions,
+          which explains why it's a .h and not .cuh file.
+*/
+
+#ifndef NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/HostBuffer.h> // for BufferTraits
+#include <nanovdb/util/cuda/Util.h> // for cudaMalloc/cudaMallocManaged/cudaFree
+
+namespace nanovdb {// ================================================================
+
+namespace cuda {// ===================================================================
+
+// ----------------------------> DeviceBuffer <--------------------------------------
+
+/// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC.
+///        Obviously this class is making explicit use of CUDA so replace it with your own memory
+///        allocator if you are not using CUDA.
+/// @note  While CUDA's pinned host memory allows for asynchronous memory copy between host and device
+///        it is significantly slower than cached (un-pinned) memory on the host.
+class DeviceBuffer
+{
+    uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device)
+    void *mCpuData, *mGpuData; // raw pointers to the host and device buffers
+    bool mManaged;
+
+public:
+    /// @brief Static factory method that returns an instance of this buffer
+    /// @param size byte size of buffer to be initialized
+    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
+    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @param stream optional stream argument (defaults to stream NULL)
+    /// @return An instance of this class using move semantics
+    static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr);
+
+    /// @brief Static factory method that returns an instance of this buffer that wraps externally managed memory
+    /// @param size byte size of buffer specified by external memory
+    /// @param cpuData pointer to externally managed host memory
+    /// @param gpuData pointer to externally managed device memory
+    /// @return An instance of this class using move semantics
+    static DeviceBuffer create(uint64_t size, void* cpuData, void* gpuData);
+
+    /// @brief Constructor
+    /// @param size byte size of buffer to be initialized
+    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @param stream optional stream argument (defaults to stream NULL)
+    DeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr)
+        : mSize(0)
+        , mCpuData(nullptr)
+        , mGpuData(nullptr)
+        , mManaged(false)
+    {
+        if (size > 0) this->init(size, host, stream);
+    }
+
+    DeviceBuffer(uint64_t size, void* cpuData, void* gpuData)
+        : mSize(size)
+        , mCpuData(cpuData)
+        , mGpuData(gpuData)
+        , mManaged(false)
+    {
+    }
+
+    /// @brief Disallow copy-construction
+    DeviceBuffer(const DeviceBuffer&) = delete;
+
+    /// @brief Move copy-constructor
+    DeviceBuffer(DeviceBuffer&& other) noexcept
+        : mSize(other.mSize)
+        , mCpuData(other.mCpuData)
+        , mGpuData(other.mGpuData)
+        , mManaged(other.mManaged)
+    {
+        other.mSize = 0;
+        other.mCpuData = nullptr;
+        other.mGpuData = nullptr;
+        other.mManaged = false;
+    }
+
+    /// @brief Disallow copy assignment operation
+    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+    /// @brief Move copy assignment operation
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept
+    {
+        this->clear();
+        mSize = other.mSize;
+        mCpuData = other.mCpuData;
+        mGpuData = other.mGpuData;
+        mManaged = other.mManaged;
+        other.mSize = 0;
+        other.mCpuData = nullptr;
+        other.mGpuData = nullptr;
+        other.mManaged = false;
+        return *this;
+    }
+
+    /// @brief Destructor frees memory on both the host and device
+    ~DeviceBuffer() { this->clear(); };
+
+    /// @brief Initialize buffer
+    /// @param size byte size of buffer to be initialized
+    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @note All existing buffers are first cleared
+    /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
+    void init(uint64_t size, bool host = true, void* stream = nullptr);
+
+    /// @brief Returns a raw pointer to the host/CPU buffer managed by this allocator.
+    /// @warning Note that the pointer can be NULL!
+    void* data() const { return mCpuData; }
+
+    /// @brief Returns a raw pointer to the device/GPU buffer managed by this allocator.
+    /// @warning Note that the pointer can be NULL!
+    void* deviceData() const { return mGpuData; }
+
+    /// @brief Upload this buffer from the host to the device, i.e. CPU -> GPU.
+    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
+    /// @param sync if false the memory copy is asynchronous
+    /// @note If the device/GPU buffer does not exist it is first allocated
+    /// @warning Assumes that the host/CPU buffer already exists
+    void deviceUpload(void* stream = nullptr, bool sync = true) const;
+
+    /// @brief Download this buffer from the device to the host, i.e. GPU -> CPU.
+    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
+    /// @param sync if false the memory copy is asynchronous
+    /// @note If the host/CPU buffer does not exist it is first allocated
+    /// @warning Assumes that the device/GPU buffer already exists
+    void deviceDownload(void* stream = nullptr, bool sync = true) const;
+
+    /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator.
+    uint64_t size() const { return mSize; }
+
+    //@{
+    /// @brief Returns true if this allocator is empty, i.e. has no allocated memory
+    bool empty() const { return mSize == 0; }
+    bool isEmpty() const { return mSize == 0; }
+    //@}
+
+    /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL
+    void clear(void* stream = nullptr);
+
+}; // DeviceBuffer class
+
+// --------------------------> Implementations below <------------------------------------
+
+inline DeviceBuffer DeviceBuffer::create(uint64_t size, const DeviceBuffer*, bool host, void* stream)
+{
+    return DeviceBuffer(size, host, stream);
+}
+
+inline DeviceBuffer DeviceBuffer::create(uint64_t size, void* cpuData, void* gpuData)
+{
+    return DeviceBuffer(size, cpuData, gpuData);
+}
+
+inline void DeviceBuffer::init(uint64_t size, bool host, void* stream)
+{
+    if (mSize > 0) this->clear(stream);
+    NANOVDB_ASSERT(size > 0);
+    if (host) {
+        cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
+        checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer");
+    } else {
+        cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+        checkPtr(mGpuData, "cuda::DeviceBuffer::init: failed to allocate device buffer");
+    }
+    mSize = size;
+    mManaged = true;
+} // DeviceBuffer::init
+
+inline void DeviceBuffer::deviceUpload(void* stream, bool sync) const
+{
+    if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory. Replace deviceUpload call with the appropriate external copy operation.");
+
+    checkPtr(mCpuData, "uninitialized cpu data");
+    if (mGpuData == nullptr) {
+        cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+    }
+    checkPtr(mGpuData, "uninitialized gpu data");
+    cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
+    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
+} // DeviceBuffer::deviceUpload
+
+inline void DeviceBuffer::deviceDownload(void* stream, bool sync) const
+{
+    if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceDownload called on externally managed memory. Replace deviceDownload call with the appropriate external copy operation.");
+
+    checkPtr(mGpuData, "uninitialized gpu data");
+    if (mCpuData == nullptr) {
+        cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
+    }
+    checkPtr(mCpuData, "uninitialized cpu data");
+    cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream)));
+    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
+} // DeviceBuffer::deviceDownload
+
+inline void DeviceBuffer::clear(void* stream)
+{
+    if (mManaged && mGpuData) cudaCheck(util::cuda::freeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
+    if (mManaged && mCpuData) cudaCheck(cudaFreeHost(mCpuData));
+    mCpuData = mGpuData = nullptr;
+    mSize = 0;
+    mManaged = false;
+} // DeviceBuffer::clear
+
+}// namespace cuda
+
+using CudaDeviceBuffer [[deprecated("Use nanovdb::cuda::DeviceBuffer instead")]] = cuda::DeviceBuffer;
+
+template<>
+struct BufferTraits<cuda::DeviceBuffer>
+{
+    static constexpr bool hasDeviceDual = true;
+};
+
+}// namespace nanovdb
+
+#endif // end of NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED
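A minimal usage sketch of the dual buffer, assuming a CUDA-enabled build and the tools::createLevelSetSphere factory shown in the examples below:

```cpp
// Sketch: build a grid in pinned host memory, mirror it on the device,
// and fetch it back after kernels have run.
auto handle = nanovdb::tools::createLevelSetSphere<float, nanovdb::cuda::DeviceBuffer>(100.0f);
handle.deviceUpload(0, false);             // asynchronous CPU -> GPU on stream 0
auto* d_grid = handle.deviceGrid<float>(); // device-side grid pointer
// ... launch kernels that read d_grid ...
handle.deviceDownload();                   // GPU -> CPU (synchronous by default)
```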
diff --git a/nanovdb/nanovdb/cuda/GridHandle.cuh b/nanovdb/nanovdb/cuda/GridHandle.cuh
new file mode 100644
index 0000000000..db3a99d713
--- /dev/null
+++ b/nanovdb/nanovdb/cuda/GridHandle.cuh
@@ -0,0 +1,145 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/cuda/GridHandle.cuh
+
+    \author Ken Museth, Doyub Kim
+
+    \date August 3, 2023
+
+    \brief Contains cuda kernels for GridHandle
+
+    \warning The header file contains cuda device code so be sure
+             to only include it in .cu files (or other .cuh files)
+*/
+
+#ifndef NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED
+
+#include <nanovdb/GridHandle.h> // required for instantiation of move c-tor of GridHandle
+#include <nanovdb/tools/cuda/GridChecksum.cuh> // for cuda::updateChecksum
+#include <nanovdb/cuda/DeviceBuffer.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+namespace {// anonymous namespace
+__global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_meta)
+{
+    nanovdb::cpyGridHandleMeta(d_data, d_meta);
+}
+
+__global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty)
+{
+    NANOVDB_ASSERT(gridIndex < gridCount);
+    if (*d_dirty = d_data->mGridIndex != gridIndex || d_data->mGridCount != gridCount) {
+        d_data->mGridIndex = gridIndex;
+        d_data->mGridCount = gridCount;
+        if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist
+    }
+}
+}// anonymous namespace
+
+template<typename BufferT, template <class, class...> class VectorT = std::vector>
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, VectorT<GridHandle<BufferT>>>::type
+splitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = nullptr, cudaStream_t stream = 0)
+{
+    const void *ptr = handle.deviceData();
+    if (ptr == nullptr) return VectorT<GridHandle<BufferT>>();
+    VectorT<GridHandle<BufferT>> handles(handle.gridCount());
+    bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
+    cudaCheck(util::cuda::mallocAsync((void**)&d_dirty, sizeof(bool), stream));
+    for (uint32_t n=0; n<handle.gridCount(); ++n) {
+        auto buffer = BufferT::create(handle.gridSize(n), other, false, stream);
+        GridData *dst = reinterpret_cast<GridData*>(buffer.deviceData());
+        const GridData *src = reinterpret_cast<const GridData*>(ptr);
+        cudaCheck(cudaMemcpyAsync(dst, src, handle.gridSize(n), cudaMemcpyDeviceToDevice, stream));
+        updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty);
+        cudaCheckError();
+        cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+        if (dirty) tools::cuda::updateChecksum(dst, CheckMode::Partial, stream);
+        handles[n] = nanovdb::GridHandle<BufferT>(std::move(buffer));
+        ptr = util::PtrAdd(ptr, handle.gridSize(n));
+    }
+    cudaCheck(util::cuda::freeAsync(d_dirty, stream));
+    return std::move(handles);
+}// cuda::splitGridHandles
+
+template<typename BufferT, template <class, class...> class VectorT>
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, GridHandle<BufferT>>::type
+mergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0)
+{
+    uint64_t size = 0u;
+    uint32_t counter = 0u, gridCount = 0u;
+    for (auto &h : handles) {
+        gridCount += h.gridCount();
+        for (uint32_t n=0; n<h.gridCount(); ++n) size += h.gridSize(n);
+    }
+    auto buffer = BufferT::create(size, other, false, stream);
+    void *dst = buffer.deviceData();
+    bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
+    cudaCheck(util::cuda::mallocAsync((void**)&d_dirty, sizeof(bool), stream));
+    for (auto &h : handles) {
+        const void *src = h.deviceData();
+        for (uint32_t n=0; n<h.gridCount(); ++n) {
+            cudaCheck(cudaMemcpyAsync(dst, src, h.gridSize(n), cudaMemcpyDeviceToDevice, stream));
+            GridData *data = reinterpret_cast<GridData*>(dst);
+            updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty);
+            cudaCheckError();
+            cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+            if (dirty) tools::cuda::updateChecksum(data, CheckMode::Partial, stream);
+            dst = util::PtrAdd(dst, h.gridSize(n));
+            src = util::PtrAdd(src, h.gridSize(n));
+        }
+    }
+    cudaCheck(util::cuda::freeAsync(d_dirty, stream));
+    return GridHandle<BufferT>(std::move(buffer));
+}// cuda::mergeGridHandles
+
+}// namespace cuda
+
+template<typename BufferT, template <class, class...> class VectorT = std::vector>
+[[deprecated("Use nanovdb::cuda::splitGridHandles instead")]]
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, VectorT<GridHandle<BufferT>>>::type
+splitDeviceGrids(const GridHandle<BufferT> &handle, const BufferT* other = nullptr, cudaStream_t stream = 0)
+{ return cuda::splitGridHandles(handle, other, stream); }
+
+template<typename BufferT, template <class, class...> class VectorT>
+[[deprecated("Use nanovdb::cuda::mergeGridHandles instead")]]
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, GridHandle<BufferT>>::type
+mergeDeviceGrids(const VectorT<GridHandle<BufferT>> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0)
+{ return cuda::mergeGridHandles(handles, other, stream); }
+
+template<typename BufferT>
+template<typename T, typename util::enable_if<BufferTraits<T>::hasDeviceDual, int>::type>
+GridHandle<BufferT>::GridHandle(T&& buffer)
+{
+    static_assert(util::is_same<T, BufferT>::value, "Expected U==BufferT");
+    mBuffer = std::move(buffer);
+    if (auto *data = reinterpret_cast<const GridData*>(mBuffer.data())) {
+        if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer");
+        mMetaData.resize(data->mGridCount);
+        cpyGridHandleMeta(data, mMetaData.data());
+    } else {
+        if (auto *d_data = reinterpret_cast<const GridData*>(mBuffer.deviceData())) {
+            GridData tmp;
+            cudaCheck(cudaMemcpy(&tmp, d_data, sizeof(GridData), cudaMemcpyDeviceToHost));
+            if (!tmp.isValid()) throw std::runtime_error("GridHandle was constructed with an invalid device buffer");
+            GridHandleMetaData *d_metaData;
+            cudaMalloc((void**)&d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData));
+            cuda::cpyGridHandleMeta<<<1,1>>>(d_data, d_metaData);
+            mMetaData.resize(tmp.mGridCount);
+            cudaCheck(cudaMemcpy(mMetaData.data(), d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData), cudaMemcpyDeviceToHost));
+            cudaCheck(cudaFree(d_metaData));
+        }
+    }
+}// GridHandle(T&& buffer)
+
+// Dummy function that ensures instantiation of the move-constructor above when BufferT=cuda::DeviceBuffer
+namespace {auto __dummy(){return GridHandle<cuda::DeviceBuffer>(std::move(cuda::DeviceBuffer()));}}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED
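A sketch of how the two helpers pair up, assuming handle is a GridHandle<cuda::DeviceBuffer> whose device buffer already holds several grids:

```cpp
// Sketch: explode a multi-grid device handle into single-grid handles and
// recombine them; mGridIndex/mGridCount and checksums are patched on device.
auto pieces = nanovdb::cuda::splitGridHandles(handle); // one handle per grid
auto merged = nanovdb::cuda::mergeGridHandles(pieces); // single handle again
```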
diff --git a/nanovdb/nanovdb/cuda/NodeManager.cuh b/nanovdb/nanovdb/cuda/NodeManager.cuh
new file mode 100644
index 0000000000..8e9f24d0f8
--- /dev/null
+++ b/nanovdb/nanovdb/cuda/NodeManager.cuh
@@ -0,0 +1,104 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/cuda/NodeManager.cuh
+
+    \author Ken Museth
+
+    \date October 3, 2023
+
+    \brief Contains cuda kernels for NodeManager
+
+    \warning The header file contains cuda device code so be sure
+             to only include it in .cu files (or other .cuh files)
+*/
+
+#ifndef NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED
+
+#include <nanovdb/util/cuda/Util.h> // for cuda::lambdaKernel
+#include <nanovdb/cuda/DeviceBuffer.h>
+#include <nanovdb/NodeManager.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+/// @brief Construct a NodeManager from a device grid pointer
+///
+/// @param d_grid device grid pointer whose nodes will be accessed sequentially
+/// @param buffer buffer from which to allocate the output handle
+/// @param stream cuda stream
+/// @return Handle that contains a device NodeManager
+template<typename BuildT, typename BufferT = DeviceBuffer>
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, NodeManagerHandle<BufferT>>::type
+createNodeManager(const NanoGrid<BuildT> *d_grid,
+                  const BufferT& pool = BufferT(),
+                  cudaStream_t stream = 0)
+{
+    auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream);
+    auto *d_data = (NodeManagerData*)buffer.deviceData();
+    size_t size = 0u, *d_size;
+    cudaCheck(util::cuda::mallocAsync((void**)&d_size, sizeof(size_t), stream));
+    util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
+#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS
+        *d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}};
+#else
+        *d_data = NodeManagerData{NANOVDB_MAGIC_NUMB, 0u, (void*)d_grid, {0u,0u,0u}};
+#endif
+        *d_size = sizeof(NodeManagerData);
+        auto &tree = d_grid->tree();
+        if (NodeManager<BuildT>::FIXED_SIZE && d_grid->isBreadthFirst()) {
+            d_data->mLinear = uint8_t(1u);
+            d_data->mOff[0] = util::PtrDiff(tree.template getFirstNode<0>(), d_grid);
+            d_data->mOff[1] = util::PtrDiff(tree.template getFirstNode<1>(), d_grid);
+            d_data->mOff[2] = util::PtrDiff(tree.template getFirstNode<2>(), d_grid);
+        } else {
+            *d_size += sizeof(uint64_t)*tree.totalNodeCount();
+        }
+    });
+    cudaCheckError();
+    cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(util::cuda::freeAsync(d_size, stream));
+    if (size > sizeof(NodeManagerData)) {
+        auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device
+        cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream));
+        buffer = std::move(tmp);
+        d_data = reinterpret_cast<NodeManagerData*>(buffer.deviceData());
+        util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) {
+            auto &tree = d_grid->tree();
+            int64_t *ptr0 = d_data->mPtr[0] = reinterpret_cast<int64_t*>(d_data + 1);
+            int64_t *ptr1 = d_data->mPtr[1] = d_data->mPtr[0] + tree.nodeCount(0);
+            int64_t *ptr2 = d_data->mPtr[2] = d_data->mPtr[1] + tree.nodeCount(1);
+            // Performs depth first traversal but breadth first insertion
+            for (auto it2 = tree.root().cbeginChild(); it2; ++it2) {
+                *ptr2++ = util::PtrDiff(&*it2, d_grid);
+                for (auto it1 = it2->cbeginChild(); it1; ++it1) {
+                    *ptr1++ = util::PtrDiff(&*it1, d_grid);
+                    for (auto it0 = it1->cbeginChild(); it0; ++it0) {
+                        *ptr0++ = util::PtrDiff(&*it0, d_grid);
+                    }// loop over child nodes of the lower internal node
+                }// loop over child nodes of the upper internal node
+            }// loop over child nodes of the root node
+        });
+    }
+
+    return NodeManagerHandle<BufferT>(toGridType<BuildT>(), std::move(buffer));
+}// cuda::createNodeManager
+
+}// namespace cuda
+
+template<typename BuildT, typename BufferT = cuda::DeviceBuffer>
+[[deprecated("Use cuda::createNodeManager instead")]]
+inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, NodeManagerHandle<BufferT>>::type
+cudaCreateNodeManager(const NanoGrid<BuildT> *d_grid,
+                      const BufferT& pool = BufferT(),
+                      cudaStream_t stream = 0)
+{
+    return cuda::createNodeManager(d_grid, pool, stream);
+}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED
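Usage is symmetrical with the grid handles; a minimal sketch, assuming d_grid is a device pointer to a NanoGrid<float> and that the handle accessors mirror those of GridHandle (deviceMgr):

```cpp
// Sketch: linearize the tree's nodes for sequential device-side traversal.
auto nmHandle = nanovdb::cuda::createNodeManager(d_grid);
auto* d_nodeMgr = nmHandle.deviceMgr<float>(); // device-side NodeManager pointer
// kernels can now iterate d_nodeMgr->leaf(n), ->lower(n), ->upper(n)
```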
NodeManagerHandle>::type +cudaCreateNodeManager(const NanoGrid *d_grid, + const BufferT& pool = BufferT(), + cudaStream_t stream = 0) +{ + return cuda::createNodeManager(d_grid, pool, stream); +} + +} // namespace nanovdb + +#endif // NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/examples/ex_bump_pool_buffer/bump_pool_buffer.cc b/nanovdb/nanovdb/examples/ex_bump_pool_buffer/bump_pool_buffer.cc index 12edb019d5..bcd54036ec 100644 --- a/nanovdb/nanovdb/examples/ex_bump_pool_buffer/bump_pool_buffer.cc +++ b/nanovdb/nanovdb/examples/ex_bump_pool_buffer/bump_pool_buffer.cc @@ -1,9 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include -#include +#include +#include +#include #include ////////////////////////////////////////////// @@ -83,11 +83,11 @@ class PoolBuffer // Mandatory. // Return non-const pointer to the buffer data. - uint8_t* data() { return mState->mPoolSlab.data() + mOffset; } + void* data() { return nanovdb::util::PtrAdd(mState->mPoolSlab.data(), mOffset); } // Mandatory. // Return const pointer to the buffer data. - const uint8_t* data() const { return mState->mPoolSlab.data() + mOffset; } + const void* data() const { return nanovdb::util::PtrAdd(mState->mPoolSlab.data(), mOffset); } }; // we specify this trait to avoid declaring the "device...(...)" convenience methods. @@ -110,8 +110,8 @@ int main() std::vector> gridHdls; // create two grids... - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, bufferContext)); - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, bufferContext)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, bufferContext)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, bufferContext)); // Get a (raw) pointer to the NanoVDB grid form the GridManager. 
auto* dstGrid = gridHdls[0].grid(); diff --git a/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc b/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc index 876c08e16a..5d0ae28475 100644 --- a/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc +++ b/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc @@ -3,12 +3,12 @@ #include #include -#include -#include -#include +#include +#include +#include #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif @@ -26,7 +26,7 @@ int main(int ac, char** av) handle = nanovdb::io::readGrid(av[1]); std::cout << "Loaded NanoVDB grid[" << handle.gridMetaData()->shortGridName() << "]...\n"; } else { - handle = nanovdb::createLevelSetSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); + handle = nanovdb::tools::createLevelSetSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); } if (handle.gridMetaData()->isLevelSet() == false) { diff --git a/nanovdb/nanovdb/examples/ex_collide_level_set/nanovdb.cu b/nanovdb/nanovdb/examples/ex_collide_level_set/nanovdb.cu index 71a976eca4..7eb9f2de06 100644 --- a/nanovdb/nanovdb/examples/ex_collide_level_set/nanovdb.cu +++ b/nanovdb/nanovdb/examples/ex_collide_level_set/nanovdb.cu @@ -5,15 +5,15 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include "common.h" #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif diff --git a/nanovdb/nanovdb/examples/ex_collide_level_set/openvdb.cc b/nanovdb/nanovdb/examples/ex_collide_level_set/openvdb.cc index ec67f754bd..294ded6010 100644 --- a/nanovdb/nanovdb/examples/ex_collide_level_set/openvdb.cc +++ b/nanovdb/nanovdb/examples/ex_collide_level_set/openvdb.cc @@ -10,13 +10,13 @@ #include #include -#include -#include +#include +#include #include "common.h" #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif @@ -29,9 +29,9 @@ void runOpenVDB(nanovdb::GridHandle& handle, int numIterations, int num using CoordT = openvdb::Coord; using RealT = float; using Vec3T = openvdb::math::Vec3; - using RayT = openvdb::math::Ray; + using RayT = openvdb::math::Ray; - auto srcGrid = nanovdb::nanoToOpenVDB(handle); + auto srcGrid = nanovdb::tools::nanoToOpenVDB(handle); std::cout << "Exporting to OpenVDB grid[" << srcGrid->getName() << "]...\n"; auto h_grid = (GridT*)srcGrid.get(); diff --git a/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda.cc b/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda.cc index b81d71c22b..1bb2a855d9 100644 --- a/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda.cc +++ b/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda.cc @@ -1,27 +1,27 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include // for nanovdb::createLevelSetSphere -#include // for nanovdb::CudaDeviceBuffer +#include +#include // for nanovdb::tools::createLevelSetSphere +#include // for nanovdb::cuda::DeviceBuffer extern "C" void launch_kernels(const nanovdb::NanoGrid*,// device grid const nanovdb::NanoGrid*,// host grid cudaStream_t stream); -/// @brief This examples depends on NanoVDB and CUDA. +/// @brief This examples depends on NanoVDB and CUDA. 
int main(int, char**) { using SrcGridT = nanovdb::FloatGrid; using DstBuildT = nanovdb::ValueOnIndex; - using BufferT = nanovdb::CudaDeviceBuffer; + using BufferT = nanovdb::cuda::DeviceBuffer; try { // Create an NanoVDB grid of a sphere at the origin with radius 100 and voxel size 1. - auto srcHandle = nanovdb::createLevelSetSphere(); + auto srcHandle = nanovdb::tools::createLevelSetSphere(); auto *srcGrid = srcHandle.grid(); // Converts the FloatGrid to an IndexGrid using CUDA for memory management. - auto idxHandle = nanovdb::createNanoGrid(*srcGrid, 1u, false , false);// 1 channel, no tiles or stats + auto idxHandle = nanovdb::tools::createNanoGrid(*srcGrid, 1u, false , false);// 1 channel, no tiles or stats cudaStream_t stream; // Create a CUDA stream to allow for asynchronous copy of pinned CUDA memory. cudaStreamCreate(&stream); diff --git a/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda_kernel.cu b/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda_kernel.cu index 5bb29979cf..ed1ae04100 100644 --- a/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda_kernel.cu +++ b/nanovdb/nanovdb/examples/ex_index_grid_cuda/index_grid_cuda_kernel.cu @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 #include // this defined the core tree data structure of NanoVDB accessable on both the host and device -#include // required since GridHandle has device code +#include // required since GridHandle has device code #include // for printf // This is called by the host only diff --git a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb/make_custom_nanovdb.cc b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb/make_custom_nanovdb.cc index aea2812a4b..6ee036eddb 100644 --- a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb/make_custom_nanovdb.cc +++ b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb/make_custom_nanovdb.cc @@ -1,25 +1,25 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include +#include +#include #include -/// @brief Creates a NanoVDB grids with custom values and access them. +/// @brief Creates a NanoVDB grid with custom values and access them. /// /// @note This example only depends on NanoVDB. int main() { try { - nanovdb::build::Grid grid(0.0f); + nanovdb::tools::build::Grid grid(0.0f); auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(1, 2, 3), 1.0f); printf("build::Grid: (%i,%i,%i)=%4.2f\t", 1, 2, 3, acc.getValue(nanovdb::Coord(1, 2, 3))); printf("build::Grid: (%i,%i,%i)=%4.2f\n", 1, 2,-3, acc.getValue(nanovdb::Coord(1, 2,-3))); - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); auto* dstGrid = handle.grid(); // Get a (raw) pointer to the NanoVDB grid form the GridManager. 
if (!dstGrid) throw std::runtime_error("GridHandle does not contain a grid with value type float"); diff --git a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc index 7b4da85f0a..1846e010e8 100644 --- a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc +++ b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc @@ -3,9 +3,9 @@ #undef NANOVDB_USE_OPENVDB // Prevents include/openvdb/points/AttributeArray.h:1841:25: error: ‘stride’ cannot be used as a function -#include -#include -#include +#include +#include +#include #include @@ -13,13 +13,13 @@ extern "C" void launch_kernels(const nanovdb::NanoGrid*,// GPU grid const nanovdb::NanoGrid*,// CPU grid cudaStream_t stream); -/// @brief Creates a NanoVDB grids with custom values and access them. +/// @brief Creates a NanoVDB grid with custom values and access them. /// /// @note This example only depends on NanoVDB. int main() { try { - using GridT = nanovdb::build::Grid; + using GridT = nanovdb::tools::build::Grid; GridT grid(0.0f);// empty grid with a background value of zero auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(1, 2, 3), 1.0f); @@ -27,7 +27,7 @@ int main() printf("build::Grid: (%i,%i,%i)=%4.2f\n", 1, 2, 3, acc.getValue(nanovdb::Coord(1, 2, 3))); // convert build::grid to a nanovdb::GridHandle using a Cuda buffer - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); auto* cpuGrid = handle.grid(); //get a (raw) pointer to a NanoVDB grid of value type float on the CPU if (!cpuGrid) throw std::runtime_error("GridHandle does not contain a grid with value type float"); diff --git a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda_kernel.cu b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda_kernel.cu index ae3556ad7a..0ab22d15ef 100644 --- a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda_kernel.cu +++ b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda_kernel.cu @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 #include // this defined the core tree data structure of NanoVDB accessable on both the host and device -#include // required since GridHandle has device code +#include // required since GridHandle has device code #include // for printf // This is called by the host only diff --git a/nanovdb/nanovdb/examples/ex_make_funny_nanovdb/make_funny_nanovdb.cc b/nanovdb/nanovdb/examples/ex_make_funny_nanovdb/make_funny_nanovdb.cc index e9b7350bb8..7997fc78e6 100644 --- a/nanovdb/nanovdb/examples/ex_make_funny_nanovdb/make_funny_nanovdb.cc +++ b/nanovdb/nanovdb/examples/ex_make_funny_nanovdb/make_funny_nanovdb.cc @@ -1,13 +1,13 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include -#include +#include +#include +#include #include -/// @brief Creates a NanoVDB grids with custom values and access them. +/// @brief Creates a NanoVDB grid with custom values and access them. /// /// @note This example only depends on NanoVDB. 
int main() @@ -20,12 +20,12 @@ int main() float v = 40.0f + 50.0f*(cos(ijk[0]*0.1f)*sin(ijk[1]*0.1f) + cos(ijk[1]*0.1f)*sin(ijk[2]*0.1f) + cos(ijk[2]*0.1f)*sin(ijk[0]*0.1f)); - v = Max(v, Vec3f(ijk).length() - size);// CSG intersection with a sphere + v = math::Max(v, Vec3f(ijk).length() - size);// CSG intersection with a sphere return v > background ? background : v < -background ? -background : v;// clamp value }; - build::Grid grid(background, "funny", GridClass::LevelSet); + tools::build::Grid grid(background, "funny", GridClass::LevelSet); grid(func, CoordBBox(Coord(-size), Coord(size))); - io::writeGrid("data/funny.nvdb", createNanoGrid(grid), io::Codec::BLOSC); + io::writeGrid("data/funny.nvdb", tools::createNanoGrid(grid), io::Codec::BLOSC); } catch (const std::exception& e) { std::cerr << "An exception occurred: \"" << e.what() << "\"" << std::endl; diff --git a/nanovdb/nanovdb/examples/ex_make_nanovdb_sphere/make_nanovdb_sphere.cc b/nanovdb/nanovdb/examples/ex_make_nanovdb_sphere/make_nanovdb_sphere.cc index a711eb3f02..a7f8980a64 100644 --- a/nanovdb/nanovdb/examples/ex_make_nanovdb_sphere/make_nanovdb_sphere.cc +++ b/nanovdb/nanovdb/examples/ex_make_nanovdb_sphere/make_nanovdb_sphere.cc @@ -1,17 +1,17 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include +#include #include -/// @brief Creates a NanoVDB grids of a level set sphere and accesses a value. +/// @brief Creates a NanoVDB grid of a level set sphere and access a value. /// /// @note This example only depends on NanoVDB. int main() { try { - auto handle = nanovdb::createLevelSetSphere(100.0f); + auto handle = nanovdb::tools::createLevelSetSphere(100.0f); auto* dstGrid = handle.grid(); // Get a (raw) pointer to the NanoVDB grid form the GridManager. if (!dstGrid) diff --git a/nanovdb/nanovdb/examples/ex_make_typed_grids/make_typed_grids.cc b/nanovdb/nanovdb/examples/ex_make_typed_grids/make_typed_grids.cc index f9d4666784..bcb0ffce95 100644 --- a/nanovdb/nanovdb/examples/ex_make_typed_grids/make_typed_grids.cc +++ b/nanovdb/nanovdb/examples/ex_make_typed_grids/make_typed_grids.cc @@ -1,9 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include -#include +#include +#include +#include // Helper struct to create a default value for the type. // We use a helper struct so we can specialize it for different types. 
@@ -37,7 +37,7 @@ void buildGridForType(std::vector>& gridHandles, T const& try { - nanovdb::build::Grid grid(bgValue, typeNameStr); + nanovdb::tools::build::Grid grid(bgValue, typeNameStr); auto acc = grid.getAccessor(); const int radius = 16; for (int z = -radius; z <= radius; ++z) { @@ -49,7 +49,7 @@ void buildGridForType(std::vector>& gridHandles, T const& } } } - gridHandles.push_back(nanovdb::createNanoGrid(grid)); + gridHandles.push_back(nanovdb::tools::createNanoGrid(grid)); } catch (const std::exception& e) { std::cerr << "An exception occurred: \"" << e.what() << "\"" << std::endl; diff --git a/nanovdb/nanovdb/examples/ex_map_pool_buffer/map_pool_buffer.cc b/nanovdb/nanovdb/examples/ex_map_pool_buffer/map_pool_buffer.cc index 526ed9c8cf..5b00ca30d8 100644 --- a/nanovdb/nanovdb/examples/ex_map_pool_buffer/map_pool_buffer.cc +++ b/nanovdb/nanovdb/examples/ex_map_pool_buffer/map_pool_buffer.cc @@ -1,9 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include -#include +#include +#include +#include #include ////////////////////////////////////////////// @@ -95,8 +95,8 @@ class MapPoolBuffer } uint64_t size() const { return getBuffer(mId).size(); } - uint8_t* data() { return getBuffer(mId).data(); } - const uint8_t* data() const { return getBuffer(mId).data(); } + void* data() { return getBuffer(mId).data(); } + const void* data() const { return getBuffer(mId).data(); } std::vector getGridKeys() const { @@ -148,8 +148,8 @@ int main() std::vector> gridHdls; // create two grids... - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, bufferContext)); - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, bufferContext)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, bufferContext)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, bufferContext)); // share grid[0]'s buffer into a parent-scope handle to prevent deletion. anotherHdl = nanovdb::GridHandle(bufferContext.copy(gridHdls[0].buffer().mId));
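Worth pausing on the renamed factory above: StatsMode moved under the tools namespace and ChecksumMode became CheckMode. A hedged reading of the argument list, with roles inferred from the NanoVDB primitives API (the stripped template brackets are not reconstructed here; bufferContext is the custom buffer from this example):

auto h = nanovdb::tools::createLevelSetSphere(
    100.0,                           // radius in world units
    nanovdb::Vec3d(-20, 0, 0),       // center in world space
    1.0,                             // voxel size
    3.0,                             // narrow-band half-width in voxels
    nanovdb::Vec3d(0),               // grid origin
    "sphere",                        // grid name
    nanovdb::tools::StatsMode::BBox, // which statistics to compute
    nanovdb::CheckMode::Partial,     // checksum mode (was ChecksumMode::Partial)
    bufferContext);                  // custom buffer/allocator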
diff --git a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc index dbda5b3d73..54d369fd53 100644 --- a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc +++ b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc @@ -4,8 +4,8 @@ /// @brief This example demonstrates how values in a NanoVDB grid can be /// modified on the device. It depends on NanoVDB and CUDA thrust. -#include -#include +#include +#include extern "C" void scaleActiveVoxels(nanovdb::FloatGrid *grid_d, uint64_t leafCount, float scale); @@ -13,7 +13,7 @@ int main() { try { // Create a NanoVDB grid of a sphere at the origin with radius 100 and voxel size 1. - auto handle = nanovdb::createLevelSetSphere(100.0f); + auto handle = nanovdb::tools::createLevelSetSphere(100.0f); using GridT = nanovdb::FloatGrid; handle.deviceUpload(0, false); // Copy the NanoVDB grid to the GPU asynchronously diff --git a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cu b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cu index 1078b8aa1b..a8590d751b 100644 --- a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cu +++ b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cu @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include extern "C" void scaleActiveVoxels(nanovdb::FloatGrid *grid_d, uint64_t leafCount, float scale) {
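The device-buffer rename is mechanical but easy to get wrong when migrating. A minimal sketch of the new spelling (the template arguments below are assumptions, since this patch's angle-bracket contents were stripped):

using BufferT = nanovdb::cuda::DeviceBuffer;  // was nanovdb::CudaDeviceBuffer
auto handle = nanovdb::tools::createLevelSetSphere<float, BufferT>(100.0f);
handle.deviceUpload(0, false);                // asynchronous host-to-device copy on stream 0
auto* d_grid = handle.deviceGrid<float>();    // raw device pointer; null on value-type mismatch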
diff --git a/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda.cc b/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda.cc index 68906b90e8..2ffe307866 100644 --- a/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda.cc +++ b/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda.cc @@ -2,22 +2,22 @@ // SPDX-License-Identifier: MPL-2.0 #include // replace with your own dependencies for generating the OpenVDB grid -#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) -#include -#include +#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) +#include +#include -extern "C" void launch_kernels(const nanovdb::NodeManager*, - const nanovdb::NodeManager*, +extern "C" void launch_kernels(const nanovdb::NodeManager*,// device NodeManager + const nanovdb::NodeManager*,// host NodeManager cudaStream_t stream); -extern "C" void cudaCreateNodeManager(const nanovdb::NanoGrid*, - nanovdb::NodeManagerHandle*); +extern "C" void cudaCreateNodeManager(const nanovdb::NanoGrid*,// device grid + nanovdb::NodeManagerHandle*);// Handle to device NodeManager /// @brief This example depends on OpenVDB, NanoVDB and CUDA. int main() { using SrcGridT = openvdb::FloatGrid; - using BufferT = nanovdb::CudaDeviceBuffer; + using BufferT = nanovdb::cuda::DeviceBuffer; try { cudaStream_t stream; // Create a CUDA stream to allow for asynchronous copy of pinned CUDA memory. cudaStreamCreate(&stream); @@ -26,7 +26,7 @@ int main() auto srcGrid = openvdb::tools::createLevelSetSphere(100.0f, openvdb::Vec3f(0.0f), 1.0f); // Converts the OpenVDB to NanoVDB and returns a GridHandle that uses CUDA for memory management. - auto gridHandle = nanovdb::createNanoGrid(*srcGrid); + auto gridHandle = nanovdb::tools::createNanoGrid(*srcGrid); gridHandle.deviceUpload(stream, false); // Copy the NanoVDB grid to the GPU asynchronously auto* grid = gridHandle.grid(); // get a (raw) pointer to a NanoVDB grid of value type float on the CPU auto* deviceGrid = gridHandle.deviceGrid(); // get a (raw) pointer to a NanoVDB grid of value type float on the GPU diff --git a/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda_kernel.cu b/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda_kernel.cu index b06c87b4e5..246d53e9e6 100644 --- a/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda_kernel.cu +++ b/nanovdb/nanovdb/examples/ex_nodemanager_cuda/nodemanager_cuda_kernel.cu @@ -2,9 +2,9 @@ // SPDX-License-Identifier: MPL-2.0 #include // this defines the core tree data structure of NanoVDB, accessible on both the host and device -#include -#include // required since GridHandle has device code -#include +#include +#include // required since GridHandle has device code +#include #include // for printf // This is called by the host only @@ -29,9 +29,9 @@ extern "C" void launch_kernels(const nanovdb::NodeManager* deviceMgr, cpu_kernel(cpuMgr); // Launch the host "kernel" (synchronously) } -// Simple wrapper that makes sure nanovdb::cudaCreateNodeManager is initiated +// Simple wrapper that makes sure nanovdb::cuda::createNodeManager is instantiated extern "C" void cudaCreateNodeManager(const nanovdb::NanoGrid *d_grid, nanovdb::NodeManagerHandle *handle) { - *handle = std::move(nanovdb::cudaCreateNodeManager(d_grid)); + *handle = std::move(nanovdb::cuda::createNodeManager(d_grid)); } \ No newline at end of file diff --git a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb/openvdb_to_nanovdb.cc b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb/openvdb_to_nanovdb.cc index 870114db39..68ca9bec83 100644 --- a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb/openvdb_to_nanovdb.cc +++ b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb/openvdb_to_nanovdb.cc @@ -2,8 +2,8 @@ // SPDX-License-Identifier: MPL-2.0 #include // replace with your own dependencies for generating the OpenVDB grid -#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) -#include +#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) +#include /// @brief Convert an openvdb level set sphere into a nanovdb, access a single value in both grids, and save NanoVDB to file. /// @@ -13,7 +13,7 @@ int main() try { // Create an OpenVDB grid of a sphere at the origin with radius 100 and voxel size 1. auto srcGrid = openvdb::tools::createLevelSetSphere(100.0f, openvdb::Vec3f(0.0f), 1.0f); - auto handle = nanovdb::createNanoGrid(*srcGrid); // Convert from OpenVDB to NanoVDB and return a shared pointer to a GridHandle. + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); // Convert from OpenVDB to NanoVDB and return a GridHandle. auto* dstGrid = handle.grid(); // Get a (raw) pointer to the NanoVDB grid from the GridHandle.
if (!dstGrid) throw std::runtime_error("GridHandle does not contain a grid with value type float"); diff --git a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_accessor/openvdb_to_nanovdb_accessor.cc b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_accessor/openvdb_to_nanovdb_accessor.cc index 4851732882..de7f3d62bb 100644 --- a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_accessor/openvdb_to_nanovdb_accessor.cc +++ b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_accessor/openvdb_to_nanovdb_accessor.cc @@ -2,8 +2,8 @@ // SPDX-License-Identifier: MPL-2.0 #include // replace with your own dependencies for generating the OpenVDB grid -#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) -#include +#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) +#include // Convert an openvdb level set sphere into a nanovdb, use accessors to print out multiple values from both // grids and save the NanoVDB grid to file. @@ -15,7 +15,7 @@ int main() auto srcGrid = openvdb::tools::createLevelSetSphere(100.0f, openvdb::Vec3f(0.0f), 1.0f); // Convert the OpenVDB grid, srcGrid, into a NanoVDB grid handle. - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); // Define a (raw) pointer to the NanoVDB grid on the host. Note we match the value type of the srcGrid! auto* dstGrid = handle.grid(); diff --git a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda.cc b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda.cc index ae4d435dfc..ca4eacf1c8 100644 --- a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda.cc +++ b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda.cc @@ -2,8 +2,8 @@ // SPDX-License-Identifier: MPL-2.0 #include // replace with your own dependencies for generating the OpenVDB grid -#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) -#include +#include // converter from OpenVDB to NanoVDB (includes NanoVDB.h and GridManager.h) +#include extern "C" void launch_kernels(const nanovdb::NanoGrid*, const nanovdb::NanoGrid*, @@ -18,7 +18,7 @@ int main(int, char**) auto srcGrid = openvdb::tools::createLevelSetSphere(100.0f, openvdb::Vec3f(0.0f), 1.0f); // Converts the OpenVDB to NanoVDB and returns a GridHandle that uses CUDA for memory management. - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); cudaStream_t stream; // Create a CUDA stream to allow for asynchronous copy of pinned CUDA memory. 
cudaStreamCreate(&stream); diff --git a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda_kernel.cu b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda_kernel.cu index 543b0e3027..2db56ef9ef 100644 --- a/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda_kernel.cu +++ b/nanovdb/nanovdb/examples/ex_openvdb_to_nanovdb_cuda/openvdb_to_nanovdb_cuda_kernel.cu @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 #include // this defines the core tree data structure of NanoVDB, accessible on both the host and device -#include // required since GridHandle has device code +#include // required since GridHandle has device code #include // for printf // This is called by the host only diff --git a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc index 29752239f1..fb429494c6 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc +++ b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc @@ -3,12 +3,12 @@ #include #include -#include -#include +#include +#include #if defined(NANOVDB_USE_CUDA) -#include -using BufferT = nanovdb::CudaDeviceBuffer; +#include +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif @@ -26,7 +26,7 @@ int main(int ac, char** av) handle = nanovdb::io::readGrid(av[1]); std::cout << "Loaded NanoVDB grid[" << handle.gridMetaData()->shortGridName() << "]...\n"; } else { - handle = nanovdb::createFogVolumeSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); + handle = nanovdb::tools::createFogVolumeSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); } if (handle.gridMetaData()->isFogVolume() == false) { diff --git a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/nanovdb.cu b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/nanovdb.cu index c65dfff85a..9e2c95a64c 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/nanovdb.cu +++ b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/nanovdb.cu @@ -6,14 +6,14 @@ #include #if defined(NANOVDB_USE_CUDA) -#include -using BufferT = nanovdb::CudaDeviceBuffer; +#include +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif -#include -#include -#include +#include +#include +#include #include "common.h" @@ -22,10 +22,10 @@ void runNanoVDB(nanovdb::GridHandle& handle, int numIterations, int wid using GridT = nanovdb::FloatGrid; using CoordT = nanovdb::Coord; using RealT = float; - using Vec3T = nanovdb::Vec3; - using RayT = nanovdb::Ray; + using Vec3T = nanovdb::math::Vec3; + using RayT = nanovdb::math::Ray; - auto* h_grid = handle.grid(); + auto *h_grid = handle.grid(); if (!h_grid) throw std::runtime_error("GridHandle does not contain a valid host grid"); diff --git a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/openvdb.cc b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/openvdb.cc index aaa9aa6a63..c41d597c61 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/openvdb.cc +++ b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/openvdb.cc @@ -11,13 +11,13 @@ #include #include -#include -#include +#include +#include #include "common.h" #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif @@ -30,7 +30,7 @@ void runOpenVDB(nanovdb::GridHandle& handle, int numIterations, int wid using Vec3T = openvdb::math::Vec3; using RayT = 
openvdb::math::Ray; - auto srcGrid = nanovdb::nanoToOpenVDB(handle); + auto srcGrid = nanovdb::tools::nanoToOpenVDB(handle); std::cout << "Exporting to OpenVDB grid[" << srcGrid->getName() << "]...\n"; auto h_grid = (GridT*)srcGrid.get(); diff --git a/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc b/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc index 5e066c20d7..88ef4125d2 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc +++ b/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc @@ -3,12 +3,12 @@ #include #include -#include -#include -#include +#include +#include +#include #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif @@ -26,7 +26,7 @@ int main(int ac, char** av) handle = nanovdb::io::readGrid(av[1]); std::cout << "Loaded NanoVDB grid[" << handle.gridMetaData()->shortGridName() << "]...\n"; } else { - handle = nanovdb::createLevelSetSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); + handle = nanovdb::tools::createLevelSetSphere(100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphere"); } if (handle.gridMetaData()->isLevelSet() == false) { diff --git a/nanovdb/nanovdb/examples/ex_raytrace_level_set/nanovdb.cu b/nanovdb/nanovdb/examples/ex_raytrace_level_set/nanovdb.cu index 14c8bd678d..d2864032e5 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_level_set/nanovdb.cu +++ b/nanovdb/nanovdb/examples/ex_raytrace_level_set/nanovdb.cu @@ -6,15 +6,15 @@ #include #if defined(NANOVDB_USE_CUDA) -#include -using BufferT = nanovdb::CudaDeviceBuffer; +#include +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif -#include -#include -#include -#include +#include +#include +#include +#include #include "common.h" @@ -23,10 +23,10 @@ void runNanoVDB(nanovdb::GridHandle& handle, int numIterations, int wid using GridT = nanovdb::FloatGrid; using CoordT = nanovdb::Coord; using RealT = float; - using Vec3T = nanovdb::Vec3; - using RayT = nanovdb::Ray; + using Vec3T = nanovdb::math::Vec3; + using RayT = nanovdb::math::Ray; - auto* h_grid = handle.grid(); + auto *h_grid = handle.grid(); if (!h_grid) throw std::runtime_error("GridHandle does not contain a valid host grid"); @@ -58,7 +58,7 @@ void runNanoVDB(nanovdb::GridHandle& handle, int numIterations, int wid float t0; CoordT ijk; float v; - if (nanovdb::ZeroCrossing(iRay, acc, ijk, v, t0)) { + if (nanovdb::math::ZeroCrossing(iRay, acc, ijk, v, t0)) { // write distance to surface. 
(we assume it is a uniform voxel) float wT0 = t0 * float(grid->voxelSize()[0]); compositeOp(image, i, width, height, wT0 / (wBBoxDimZ * 2), 1.0f); diff --git a/nanovdb/nanovdb/examples/ex_raytrace_level_set/openvdb.cc b/nanovdb/nanovdb/examples/ex_raytrace_level_set/openvdb.cc index c8a28e60eb..ef64f1d463 100644 --- a/nanovdb/nanovdb/examples/ex_raytrace_level_set/openvdb.cc +++ b/nanovdb/nanovdb/examples/ex_raytrace_level_set/openvdb.cc @@ -11,18 +11,18 @@ #include #include -#include -#include +#include +#include #include "common.h" #if defined(NANOVDB_USE_CUDA) -using BufferT = nanovdb::CudaDeviceBuffer; +using BufferT = nanovdb::cuda::DeviceBuffer; #else using BufferT = nanovdb::HostBuffer; #endif -void runOpenVDB(nanovdb::GridHandle& handle, int numIterations, int width, int height, BufferT& imageBuffer) +void runOpenVDB(nanovdb::GridHandle& handle, int numIterations, int width, int height, BufferT& imageBuffer) { using GridT = openvdb::FloatGrid; using CoordT = openvdb::Coord; @@ -31,7 +31,7 @@ void runOpenVDB(nanovdb::GridHandle& handle, int numI using RayT = openvdb::math::Ray; #if 1 - auto srcGrid = nanovdb::nanoToOpenVDB(handle); + auto srcGrid = nanovdb::tools::nanoToOpenVDB(handle); std::cout << "Exporting to OpenVDB grid[" << srcGrid->getName() << "]...\n"; #else openvdb::initialize(); diff --git a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere/read_nanovdb_sphere.cc b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere/read_nanovdb_sphere.cc index 76f56fe2dc..9bdf8f8f9d 100644 --- a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere/read_nanovdb_sphere.cc +++ b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere/read_nanovdb_sphere.cc @@ -1,7 +1,7 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include +#include /// @brief Read a NanoVDB grid from file, check the pointer, and access a single value ///
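These reader examples share one entry point. A minimal usage sketch (file name hypothetical; the default HostBuffer template/buffer parameters are assumed from the io/IO.h declarations added later in this patch):

auto handle = nanovdb::io::readGrid("data/sphere.nvdb"); // reads grid #0 into a host-buffer-backed handle
auto* grid = handle.grid<float>();                       // null if grid #0 is not a float grid
if (!grid) throw std::runtime_error("expected a float grid");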
diff --git a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc index c9cf0b0883..91010b6cf7 100644 --- a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc +++ b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc @@ -1,4 +1,4 @@ -#include // this is required to read (and write) NanoVDB files on the host +#include // this is required to read (and write) NanoVDB files on the host /// @brief Read a NanoVDB grid from a file and print out multiple values. /// diff --git a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor_cuda/read_nanovdb_sphere_accessor_cuda.cu b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor_cuda/read_nanovdb_sphere_accessor_cuda.cu index 4343e01420..c07393a77f 100644 --- a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor_cuda/read_nanovdb_sphere_accessor_cuda.cu +++ b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor_cuda/read_nanovdb_sphere_accessor_cuda.cu @@ -1,9 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include // this is required to read (and write) NanoVDB files on the host -#include // required for CUDA memory management -#include +#include // this is required to read (and write) NanoVDB files on the host +#include // required for CUDA memory management +#include extern "C" void launch_kernels(const nanovdb::NanoGrid*, const nanovdb::NanoGrid*, diff --git a/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/VoxToNanoVDB.h b/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/VoxToNanoVDB.h index 98bacb538e..ab9316ad33 100644 --- a/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/VoxToNanoVDB.h +++ b/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/VoxToNanoVDB.h @@ -3,8 +3,8 @@ #pragma once -#include -#include +#include +#include #define OGT_VOX_IMPLEMENTATION #include "ogt_vox.h" @@ -132,7 +132,7 @@ nanovdb::GridHandle convertVoxToNanoVDB(const std::string& inFilename, try { if (const auto* scene = detail::load_vox_scene(inFilename.c_str())) { // we just merge into one grid... - nanovdb::build::Grid grid(nanovdb::Rgba8(),modelName,nanovdb::GridClass::VoxelVolume); + nanovdb::tools::build::Grid grid(nanovdb::math::Rgba8(),modelName,nanovdb::GridClass::VoxelVolume); auto acc = grid.getAccessor(); auto processModelFn = [&](int modelIndex, const ogt_vox_transform& xform) { @@ -145,7 +145,7 @@ nanovdb::GridHandle convertVoxToNanoVDB(const std::string& inFilename, if (uint8_t color_index = model->voxel_data[voxel_index]) { ogt_vox_rgba rgba = scene->palette.color[color_index]; auto ijk = nanovdb::Coord::Floor(detail::matMult4x4((float*)&xform, nanovdb::Vec4f(x, y, z, 1))); - acc.setValue(nanovdb::Coord(ijk[0], ijk[2], -ijk[1]), *reinterpret_cast(&rgba)); + acc.setValue(nanovdb::Coord(ijk[0], ijk[2], -ijk[1]), *reinterpret_cast(&rgba)); } } } @@ -185,7 +185,7 @@ nanovdb::GridHandle convertVoxToNanoVDB(const std::string& inFilename, printf("scene processing end.\n"); ogt_vox_destroy_scene(scene); - return nanovdb::createNanoGrid(grid); + return nanovdb::tools::createNanoGrid(grid); } else { std::ostringstream ss; ss << "Invalid file \"" << inFilename << "\""; diff --git a/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/vox_to_nanovdb.cc b/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/vox_to_nanovdb.cc index bf6d528e1c..27b1b67d2b 100644 --- a/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/vox_to_nanovdb.cc +++ b/nanovdb/nanovdb/examples/ex_vox_to_nanovdb/vox_to_nanovdb.cc @@ -1,7 +1,7 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include +#include #include "VoxToNanoVDB.h" /// @brief Convert a .vox file into a .nvdb file.
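The VoxToNanoVDB changes above move Rgba8 into the math namespace. A minimal sketch of building a color (voxel-volume) grid under the new names (template brackets reconstructed, since they were stripped from this patch; grid name and values hypothetical):

nanovdb::tools::build::Grid<nanovdb::math::Rgba8> grid(
    nanovdb::math::Rgba8(),            // background color
    "vox",                             // grid name
    nanovdb::GridClass::VoxelVolume);
auto acc = grid.getAccessor();
acc.setValue(nanovdb::Coord(0, 0, 0), nanovdb::math::Rgba8(255, 0, 0, 255));
auto handle = nanovdb::tools::createNanoGrid(grid);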
diff --git a/nanovdb/nanovdb/examples/ex_voxels_to_grid_cuda/ex_voxels_to_grid_cuda.cu b/nanovdb/nanovdb/examples/ex_voxels_to_grid_cuda/ex_voxels_to_grid_cuda.cu index d3ca1d8883..83ffc37067 100644 --- a/nanovdb/nanovdb/examples/ex_voxels_to_grid_cuda/ex_voxels_to_grid_cuda.cu +++ b/nanovdb/nanovdb/examples/ex_voxels_to_grid_cuda/ex_voxels_to_grid_cuda.cu @@ -1,22 +1,21 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include +#include +#include /// @brief Demonstrates how to create a NanoVDB grid from voxel coordinates on the GPU int main() { - using namespace nanovdb; - try { // Define list of voxel coordinates and copy them to the device const size_t numVoxels = 3; - Coord coords[numVoxels] = {Coord(1, 2, 3), Coord(-1,3,6), Coord(-90,100,5678)}, *d_coords = nullptr; - cudaCheck(cudaMalloc(&d_coords, numVoxels * sizeof(Coord))); - cudaCheck(cudaMemcpy(d_coords, coords, numVoxels * sizeof(Coord), cudaMemcpyHostToDevice));// coords CPU -> GPU + nanovdb::Coord coords[numVoxels] = {nanovdb::Coord(1, 2, 3), nanovdb::Coord(-1,3,6), nanovdb::Coord(-90,100,5678)}, *d_coords = nullptr; + cudaCheck(cudaMalloc(&d_coords, numVoxels * sizeof(nanovdb::Coord))); + cudaCheck(cudaMemcpy(d_coords, coords, numVoxels * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// coords CPU -> GPU // Generate a NanoVDB grid that contains the list of voxels on the device - auto handle = cudaVoxelsToGrid(d_coords, numVoxels); + auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, numVoxels); auto *d_grid = handle.deviceGrid(); // Define a list of values and copy them to the device @@ -25,10 +24,10 @@ int main() cudaCheck(cudaMemcpy(d_values, values, numVoxels * sizeof(float), cudaMemcpyHostToDevice));// values CPU -> GPU // Launch a device kernel that sets the values of voxels defined above and prints them - const unsigned int numThreads = 128, numBlocks = (numVoxels + numThreads - 1) / numThreads; - cudaLambdaKernel<<>>(numVoxels, [=] __device__(size_t tid) { - using OpT = SetVoxel;// defines type of random-access operation (set value) - const Coord &ijk = d_coords[tid]; + const unsigned int numThreads = 128, numBlocks = nanovdb::util::cuda::blocksPerGrid(numVoxels, numThreads); + nanovdb::util::cuda::lambdaKernel<<>>(numVoxels, [=] __device__(size_t tid) { + using OpT = nanovdb::SetVoxel;// defines type of random-access operation (set value) + const nanovdb::Coord &ijk = d_coords[tid]; d_grid->tree().set(ijk, d_values[tid]);// normally one should use a ValueAccessor printf("GPU: voxel # %lu, grid(%4i,%4i,%4i) = %5.1f\n", tid, ijk[0], ijk[1], ijk[2], d_grid->tree().getValue(ijk)); }); cudaCheckError(); @@ -37,7 +36,7 @@ int main() handle.deviceDownload();// creates a copy on the CPU auto *grid = handle.grid(); for (size_t i=0; itree().getValue(ijk)); }
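The kernel-launch helpers renamed above are generic. A hedged sketch of their use (d_data and n are hypothetical device array and element count; the diff's launch arguments between <<< and >>> were stripped by extraction, so the configuration below is reconstructed):

const unsigned int numThreads = 128;
const unsigned int numBlocks = nanovdb::util::cuda::blocksPerGrid(n, numThreads); // ceil(n / numThreads)
nanovdb::util::cuda::lambdaKernel<<<numBlocks, numThreads>>>(n, [=] __device__ (size_t tid) {
    d_data[tid] *= 2.0f; // any per-element work
});
cudaCheckError();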
diff --git a/nanovdb/nanovdb/examples/ex_write_nanovdb_grids/write_nanovdb_grids.cc b/nanovdb/nanovdb/examples/ex_write_nanovdb_grids/write_nanovdb_grids.cc index 314fe4ea57..2f9465e62e 100644 --- a/nanovdb/nanovdb/examples/ex_write_nanovdb_grids/write_nanovdb_grids.cc +++ b/nanovdb/nanovdb/examples/ex_write_nanovdb_grids/write_nanovdb_grids.cc @@ -1,8 +1,8 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#include -#include +#include +#include /// @brief Creates multiple NanoVDB grids, accesses a value in one, and saves all grids to file. /// @@ -12,11 +12,11 @@ int main() try { std::vector> handles; // Create multiple NanoVDB grids of various types - handles.push_back(nanovdb::createLevelSetSphere(100.0f)); - handles.push_back(nanovdb::createLevelSetTorus(100.0f, 50.0f)); - handles.push_back(nanovdb::createLevelSetBox(400.0f, 600.0f, 800.0f)); - handles.push_back(nanovdb::createLevelSetBBox(400.0f, 600.0f, 800.0f, 10.0f)); - handles.push_back(nanovdb::createPointSphere(1, 100.0f)); + handles.push_back(nanovdb::tools::createLevelSetSphere(100.0f)); + handles.push_back(nanovdb::tools::createLevelSetTorus(100.0f, 50.0f)); + handles.push_back(nanovdb::tools::createLevelSetBox(400.0f, 600.0f, 800.0f)); + handles.push_back(nanovdb::tools::createLevelSetBBox(400.0f, 600.0f, 800.0f, 10.0f)); + handles.push_back(nanovdb::tools::createPointSphere(1, 100.0f)); auto* dstGrid = handles[0].grid(); // Get a (raw) pointer to the NanoVDB grid from the GridHandle. if (!dstGrid) diff --git a/nanovdb/nanovdb/io/IO.h b/nanovdb/nanovdb/io/IO.h new file mode 100644 index 0000000000..3c6259140f --- /dev/null +++ b/nanovdb/nanovdb/io/IO.h @@ -0,0 +1,767 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file IO.h + + \author Ken Museth + + \date May 1, 2020 + + \brief Implements I/O for NanoVDB grids. Features optional BLOSC and ZIP + file compression, support for multiple grids per file as well as + multiple grid types. + + \note This file does NOT depend on OpenVDB, but optionally on ZIP and BLOSC + + \details NanoVDB files take one of two formats: + 1) multiple segments each with multiple grids (segments have easy-to-access metadata about their grids) + 2) starting with version 32.6.0 NanoVDB files also support a raw buffer with one or more grids (just a + dump of a raw grid buffer, so no new metadata in headers as when using segments mentioned above). + + // 1: Segment: FileHeader, MetaData0, gridName0...MetaDataN, gridNameN, compressed Grid0, ... compressed GridN + // 2: Raw: Grid0, ... GridN +*/ + +#ifndef NANOVDB_IO_H_HAS_BEEN_INCLUDED +#define NANOVDB_IO_H_HAS_BEEN_INCLUDED + +#include +#include +#include // for updateGridCount + +#include // for std::ifstream +#include // for std::cerr/cout +#include // for std::string +#include // for std::stringstream +#include // for std::strcmp +#include // for std::unique_ptr +#include // for std::vector +#ifdef NANOVDB_USE_ZIP +#include // for ZIP compression +#endif +#ifdef NANOVDB_USE_BLOSC +#include // for BLOSC compression +#endif + +// Due to a bug in older versions of gcc, including fstream might +// define "major" and "minor" which are used as member data below. +// See https://bugzilla.redhat.com/show_bug.cgi?id=130601 +#if defined(major) || defined(minor) +#undef major +#undef minor +#endif + +namespace nanovdb {// ========================================================== + +namespace io {// =============================================================== + +// --------------------------> writeGrid(s) <------------------------------------ + +/// @brief Write a single grid to file (over-writing existing content of the file) +/// +/// @note The single grid is written into a single segment, i.e. header with metadata about its type and size. +template +void writeGrid(const std::string& fileName, const GridHandle& handle, io::Codec codec = io::Codec::NONE, int verbose = 0); +
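// A usage sketch of the single-grid writer declared above (file name is
// illustrative; this mirrors the make_funny_nanovdb example earlier in
// this patch):
//
//   auto handle = nanovdb::tools::createLevelSetSphere(100.0f);
//   nanovdb::io::writeGrid("sphere.nvdb", handle, nanovdb::io::Codec::BLOSC);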
+/// @brief Write multiple grids to file (over-writing existing content of the file) +/// +/// @note The multiple grids are written into the same segment, i.e. header with metadata about all grids +template class VecT = std::vector> +void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec = Codec::NONE, int verbose = 0); + +// --------------------------> readGrid(s) <------------------------------------ + +/// @brief Read and return one or all grids from a file into a single GridHandle +/// @tparam BufferT Type of buffer used for memory allocation +/// @param fileName string name of file to be read from +/// @param n zero-based signed index of the grid to be read. +/// The default value of 0 means read only the first grid. +/// A negative value of n means read all grids in the file. +/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return return a single GridHandle with one or all grids found in the file +/// @throw will throw a std::runtime_error if the file does not contain a grid with index n +template +GridHandle readGrid(const std::string& fileName, int n = 0, int verbose = 0, const BufferT& buffer = BufferT()); + +/// @brief Read and return the first grid with a specific name from a file +/// @tparam BufferT Type of buffer used for memory allocation +/// @param fileName string name of file to be read from +/// @param gridName string name of the grid to be read +/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return return a single GridHandle containing the grid with the specific name +/// @throw will throw a std::runtime_error if the file does not contain a grid with the specific name +template +GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose = 0, const BufferT& buffer = BufferT()); + +/// @brief Read all the grids in the file and return them as a vector of multiple GridHandles, each containing +/// all grids encoded in the same segment of the file (i.e. they were written together). This method also +/// works if the file contains a raw grid buffer in which case a single GridHandle is returned. +/// @tparam BufferT Type of buffer used for memory allocation +/// @param fileName string name of file to be read from +/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return Return a vector of GridHandles each containing all grids encoded +/// in the same segment of the file (i.e. they were written together). +template class VecT = std::vector> +VecT> readGrids(const std::string& fileName, int verbose = 0, const BufferT& buffer = BufferT());
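// A usage sketch of the multi-grid reader declared above (file name is
// hypothetical; one handle is returned per segment found in the file):
//
//   auto handles = nanovdb::io::readGrids("grids.nvdb");
//   for (auto &h : handles) std::cout << h.gridCount() << " grid(s)\n";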
+ +// ----------------------------------------------------------------------- + +/// We fix a specific size for counting bytes in files so that they +/// are saved the same regardless of machine precision. (Note there are +/// still little/big-endian issues, however) +using fileSize_t = uint64_t; + +/// @brief Internal functions for compressed read/write of a NanoVDB GridHandle into a stream +/// +/// @warning These functions should never be called directly by client code +namespace Internal { +static constexpr fileSize_t MAX_SIZE = 1UL << 30; // size is 1 GB + +template +static fileSize_t write(std::ostream& os, const GridHandle& handle, Codec codec, uint32_t n); + +template +static void read(std::istream& is, BufferT& buffer, Codec codec); + +static void read(std::istream& is, char* data, fileSize_t size, Codec codec); +} // namespace Internal + +/// @brief Standard hash function to use on strings; std::hash may vary by +/// platform/implementation and is known to produce frequent collisions. +uint64_t stringHash(const char* cstr); + +/// @brief Return a uint64_t hash key of a std::string +inline uint64_t stringHash(const std::string& str){return stringHash(str.c_str());} + +/// @brief Return a uint64_t with its bytes reversed so we can check for endianness +inline uint64_t reverseEndianness(uint64_t val) +{ + return (((val) >> 56) & 0x00000000000000FF) | (((val) >> 40) & 0x000000000000FF00) | + (((val) >> 24) & 0x0000000000FF0000) | (((val) >> 8) & 0x00000000FF000000) | + (((val) << 8) & 0x000000FF00000000) | (((val) << 24) & 0x0000FF0000000000) | + (((val) << 40) & 0x00FF000000000000) | (((val) << 56) & 0xFF00000000000000); +} + +/// @brief This class defines the meta data stored for each grid in a segment +/// +/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData +/// each followed by grid names and then finally the grids themselves. +/// +/// @note This class should not be confused with nanovdb::GridMetaData defined in NanoVDB.h +/// Also, io::FileMetaData is defined in NanoVDB.h. +struct FileGridMetaData : public FileMetaData +{ + static_assert(sizeof(FileMetaData) == 176, "Unexpected sizeof(FileMetaData)"); + std::string gridName; + void read(std::istream& is); + void write(std::ostream& os) const; + FileGridMetaData() {} + FileGridMetaData(uint64_t size, Codec c, const GridData &gridData); + uint64_t memUsage() const { return sizeof(FileMetaData) + nameSize; } +}; // FileGridMetaData
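// The nameKey field inherited from FileMetaData caches a 64-bit hash of the
// grid name, so lookups (e.g. hasGrid further below) can reject non-matches
// without a string compare. A sketch of the invariant maintained by the
// constructor above:
//
//   nanovdb::io::FileGridMetaData m(size, codec, gridData);
//   assert(m.nameKey == nanovdb::io::stringHash(m.gridName));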
+ +/// @brief This class defines all the data stored in a segment of a file +/// +/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData +/// each followed by grid names and then finally the grids themselves. +struct Segment +{ + // Check assumptions made during read and write of FileHeader and FileMetaData + static_assert(sizeof(FileHeader) == 16u, "Unexpected sizeof(FileHeader)"); + FileHeader header;// defined in NanoVDB.h + std::vector meta;// defined in NanoVDB.h + Segment(Codec c = Codec::NONE) +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + : header{NANOVDB_MAGIC_FILE, Version(), 0u, c} +#else + : header{NANOVDB_MAGIC_NUMB, Version(), 0u, c} +#endif + , meta() + { + } + template + void add(const GridHandle& h); + bool read(std::istream& is); + void write(std::ostream& os) const; + uint64_t memUsage() const; +}; // Segment + +/// @brief Return true if the file contains a grid with the specified name +bool hasGrid(const std::string& fileName, const std::string& gridName); + +/// @brief Return true if the stream contains a grid with the specified name +bool hasGrid(std::istream& is, const std::string& gridName); + +/// @brief Reads and returns a vector of meta data for all the grids found in the specified file +std::vector readGridMetaData(const std::string& fileName); + +/// @brief Reads and returns a vector of meta data for all the grids found in the specified stream +std::vector readGridMetaData(std::istream& is); + +// --------------------------> Implementations for Internal <------------------------------------ + +template +fileSize_t Internal::write(std::ostream& os, const GridHandle& handle, Codec codec, unsigned int n) +{ + const char* data = reinterpret_cast(handle.gridData(n)); + fileSize_t total = 0, residual = handle.gridSize(n); + + switch (codec) { + case Codec::ZIP: { +#ifdef NANOVDB_USE_ZIP + uLongf size = compressBound(static_cast(residual)); // Get an upper bound on the size of the compressed data. + std::unique_ptr tmp(new Bytef[size]); + const int status = compress(tmp.get(), &size, reinterpret_cast(data), static_cast(residual)); + if (status != Z_OK) throw std::runtime_error("Internal write error in ZIP"); + if (size > residual) std::cerr << "\nWarning: Unexpected ZIP compression from " << residual << " to " << size << " bytes\n"; + const fileSize_t outBytes = size; + os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); + os.write(reinterpret_cast(tmp.get()), outBytes); + total += sizeof(fileSize_t) + outBytes; +#else + throw std::runtime_error("ZIP compression codec was disabled during build"); +#endif + break; + } + case Codec::BLOSC: { +#ifdef NANOVDB_USE_BLOSC + do { + fileSize_t chunk = residual < MAX_SIZE ? 
residual : MAX_SIZE, size = chunk + BLOSC_MAX_OVERHEAD; + std::unique_ptr tmp(new char[size]); + const int count = blosc_compress_ctx(9, 1, sizeof(float), chunk, data, tmp.get(), size, BLOSC_LZ4_COMPNAME, 1 << 18, 1); + if (count <= 0) throw std::runtime_error("Internal write error in BLOSC"); + const fileSize_t outBytes = count; + os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); + os.write(reinterpret_cast(tmp.get()), outBytes); + total += sizeof(fileSize_t) + outBytes; + data += chunk; + residual -= chunk; + } while (residual > 0); +#else + throw std::runtime_error("BLOSC compression codec was disabled during build"); +#endif + break; + } + default: + os.write(data, residual); + total += residual; + } + if (!os) throw std::runtime_error("Failed to write Tree to file"); + return total; +} // Internal::write + +template +void Internal::read(std::istream& is, BufferT& buffer, Codec codec) +{ + Internal::read(is, reinterpret_cast(buffer.data()), buffer.size(), codec); +} // Internal::read + +/// @brief read compressed grid from stream +/// @param is input stream to read from +/// @param data data buffer to write into. Must be of size @c residual or larger. +/// @param residual expected byte size of uncompressed data. +/// @param codec mode of compression +void Internal::read(std::istream& is, char* data, fileSize_t residual, Codec codec) +{ + // read tree using optional compression + switch (codec) { + case Codec::ZIP: { +#ifdef NANOVDB_USE_ZIP + fileSize_t size; + is.read(reinterpret_cast(&size), sizeof(fileSize_t)); + std::unique_ptr tmp(new Bytef[size]);// temp buffer for compressed data + is.read(reinterpret_cast(tmp.get()), size); + uLongf numBytes = static_cast(residual); + int status = uncompress(reinterpret_cast(data), &numBytes, tmp.get(), static_cast(size)); + if (status != Z_OK) throw std::runtime_error("Internal read error in ZIP"); + if (fileSize_t(numBytes) != residual) throw std::runtime_error("UNZIP failed on byte size"); +#else + throw std::runtime_error("ZIP compression codec was disabled during build"); +#endif + break; + } + case Codec::BLOSC: { +#ifdef NANOVDB_USE_BLOSC + do { + fileSize_t size; + is.read(reinterpret_cast(&size), sizeof(fileSize_t)); + std::unique_ptr tmp(new char[size]);// temp buffer for compressed data + is.read(reinterpret_cast(tmp.get()), size); + const fileSize_t chunk = residual < MAX_SIZE ? 
residual : MAX_SIZE; + const int count = blosc_decompress_ctx(tmp.get(), data, size_t(chunk), 1); //fails with more threads :( + if (count < 1) throw std::runtime_error("Internal read error in BLOSC"); + if (count != int(chunk)) throw std::runtime_error("BLOSC failed on byte size"); + data += size_t(chunk); + residual -= chunk; + } while (residual > 0); +#else + throw std::runtime_error("BLOSC compression codec was disabled during build"); +#endif + break; + } + default: + is.read(data, residual);// read uncompressed data + } + if (!is) throw std::runtime_error("Failed to read Tree from file"); +} // Internal::read + +// --------------------------> Implementations for FileGridMetaData <------------------------------------ + +inline FileGridMetaData::FileGridMetaData(uint64_t size, Codec c, const GridData &gridData) + : FileMetaData{size, // gridSize + size, // fileSize (will typically be redefined) + 0u, // nameKey + 0u, // voxelCount + gridData.mGridType, // gridType + gridData.mGridClass, // gridClass + gridData.mWorldBBox, // worldBBox + gridData.indexBBox(), // indexBBox + gridData.mVoxelSize, // voxelSize + 0, // nameSize + {0, 0, 0, 1}, // nodeCount[4] + {0, 0, 0}, // tileCount[3] + c, // codec + 0, // padding + Version()}// version + , gridName(gridData.gridName()) +{ + auto &treeData = *reinterpret_cast(gridData.treePtr()); + nameKey = stringHash(gridName); + voxelCount = treeData.mVoxelCount; + nameSize = static_cast(gridName.size() + 1); // include '\0' + for (int i = 0; i < 3; ++i) { + FileMetaData::nodeCount[i] = treeData.mNodeCount[i]; + FileMetaData::tileCount[i] = treeData.mTileCount[i]; + } +}// FileGridMetaData::FileGridMetaData + +inline void FileGridMetaData::write(std::ostream& os) const +{ + os.write(reinterpret_cast(this), sizeof(FileMetaData)); + os.write(gridName.c_str(), nameSize); + if (!os) throw std::runtime_error("Failed writing FileGridMetaData"); +}// FileGridMetaData::write + +inline void FileGridMetaData::read(std::istream& is) +{ + is.read(reinterpret_cast(this), sizeof(FileMetaData)); + std::unique_ptr tmp(new char[nameSize]); + is.read(reinterpret_cast(tmp.get()), nameSize); + gridName.assign(tmp.get()); + if (!is) throw std::runtime_error("Failed reading FileGridMetaData"); +}// FileGridMetaData::read + +// --------------------------> Implementations for Segment <------------------------------------ + +inline uint64_t Segment::memUsage() const +{ + uint64_t sum = sizeof(FileHeader); + for (auto& m : meta) sum += m.memUsage();// includes FileMetaData + grid name + return sum; +}// Segment::memUsage + +template +inline void Segment::add(const GridHandle& h) +{ + for (uint32_t i = 0; i < h.gridCount(); ++i) { + const GridData *gridData = h.gridData(i); + if (!gridData) throw std::runtime_error("Segment::add: GridHandle does not contain grid #" + std::to_string(i)); + meta.emplace_back(h.gridSize(i), header.codec, *gridData); + } + header.gridCount += h.gridCount(); +}// Segment::add + +inline void Segment::write(std::ostream& os) const +{ + if (header.gridCount == 0) { + throw std::runtime_error("Segment contains no grids"); + } else if (!os.write(reinterpret_cast(&header), sizeof(FileHeader))) { + throw std::runtime_error("Failed to write FileHeader of Segment"); + } + for (auto& m : meta) m.write(os); +}// Segment::write + +inline bool Segment::read(std::istream& is) +{ + is.read(reinterpret_cast(&header), sizeof(FileHeader)); + if (is.eof()) {// The EOF flag is only set once a read tries to read past the end of the file + is.clear(std::ios_base::eofbit);// 
clear eof flag so we can rewind and read again + return false; + } + const MagicType magic = toMagic(header.magic); + if (magic != MagicType::NanoVDB && magic != MagicType::NanoFile) { + // first check for byte-swapped header magic. + if (header.magic == reverseEndianness(NANOVDB_MAGIC_NUMB) || + header.magic == reverseEndianness(NANOVDB_MAGIC_FILE)) { + throw std::runtime_error("This nvdb file has reversed endianness"); + } else { + if (magic == MagicType::OpenVDB) { + throw std::runtime_error("Expected a NanoVDB file, but read an OpenVDB file!"); + } else if (magic == MagicType::NanoGrid) { + throw std::runtime_error("Expected a NanoVDB file, but read a raw NanoVDB grid!"); + } else { + throw std::runtime_error("Expected a NanoVDB file, but read a file of unknown type!"); + } + } + } else if ( !header.version.isCompatible()) { + std::stringstream ss; + Version v; + is.read(reinterpret_cast(&v), sizeof(Version));// read GridData::mVersion located at byte 16=sizeof(FileHeader) in the stream + if ( v.getMajor() == NANOVDB_MAJOR_VERSION_NUMBER) { + ss << "This file looks like it contains a raw grid buffer and not a standard file with meta data"; + } else if ( header.version.getMajor() < NANOVDB_MAJOR_VERSION_NUMBER) { + char str[30]; + ss << "The file contains an older version of NanoVDB: " << std::string(toStr(str, header.version)) << "!\n\t" + << "Recommendation: Re-generate this NanoVDB file with this version: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X of NanoVDB"; + } else { + ss << "This tool was compiled against an older version of NanoVDB: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X!\n\t" + << "Recommendation: Re-compile this tool against the newer version: " << header.version.getMajor() << ".X of NanoVDB"; + } + throw std::runtime_error("An unrecoverable error in nanovdb::Segment::read:\n\tIncompatible file format: " + ss.str()); + } + meta.resize(header.gridCount); + for (auto& m : meta) { + m.read(is); + m.version = header.version; + } + return true; +}// Segment::read + +// --------------------------> writeGrid <------------------------------------ + +template +void writeGrid(std::ostream& os, const GridHandle& handle, Codec codec) +{ + Segment seg(codec); + seg.add(handle); + const auto start = os.tellp(); + seg.write(os); // write header without the correct fileSize (so it's allocated) + for (uint32_t i = 0; i < handle.gridCount(); ++i) { + seg.meta[i].fileSize = Internal::write(os, handle, codec, i); + } + os.seekp(start); + seg.write(os);// re-write header with the correct fileSize + os.seekp(0, std::ios_base::end);// skip to end +}// writeGrid + +template +void writeGrid(const std::string& fileName, const GridHandle& handle, Codec codec, int verbose) +{ + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) { + throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + } + writeGrid(os, handle, codec); + if (verbose) { + std::cout << "Wrote nanovdb::Grid to file named \"" << fileName << "\"" << std::endl; + } +}// writeGrid + +// --------------------------> writeGrids <------------------------------------ + +template class VecT = std::vector> +void writeGrids(std::ostream& os, const VecT>& handles, Codec codec = Codec::NONE) +{ + for (auto& h : handles) writeGrid(os, h, codec); +}// writeGrids + +template class VecT> +void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec, int verbose) +{ + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if 
(!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + writeGrids(os, handles, codec); + if (verbose) std::cout << "Wrote " << handles.size() << " nanovdb::Grid(s) to file named \"" << fileName << "\"" << std::endl; +}// writeGrids + +// --------------------------> readGrid <------------------------------------ + +template +GridHandle readGrid(std::istream& is, int n, const BufferT& pool) +{ + GridHandle handle; + if (n<0) {// read all grids into the same buffer + try {//first try to read a raw grid buffer + handle.read(is, pool); + } catch(const std::logic_error&) { + Segment seg; + uint64_t bufferSize = 0u; + uint32_t gridCount = 0u, gridIndex = 0u; + const auto start = is.tellg(); + while (seg.read(is)) { + std::streamoff skipSize = 0; + for (auto& m : seg.meta) { + ++gridCount; + bufferSize += m.gridSize; + skipSize += m.fileSize; + }// loop over grids in segment + is.seekg(skipSize, std::ios_base::cur); // skip forward from the current position + }// loop over segments + auto buffer = BufferT::create(bufferSize, &pool); + char *ptr = (char*)buffer.data(); + is.seekg(start);// rewind + while (seg.read(is)) { + for (auto& m : seg.meta) { + Internal::read(is, ptr, m.gridSize, seg.header.codec); + tools::updateGridCount((GridData*)ptr, gridIndex++, gridCount); + ptr += m.gridSize; + }// loop over grids in segment + }// loop over segments + return GridHandle(std::move(buffer)); + } + } else {// read a specific grid + try {//first try to read a raw grid buffer + handle.read(is, uint32_t(n), pool); + tools::updateGridCount((GridData*)handle.data(), 0u, 1u); + } catch(const std::logic_error&) { + Segment seg; + int counter = -1; + while (seg.read(is)) { + std::streamoff seek = 0; + for (auto& m : seg.meta) { + if (++counter == n) { + auto buffer = BufferT::create(m.gridSize, &pool); + Internal::read(is, buffer, seg.header.codec); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + return GridHandle(std::move(buffer)); + } else { + seek += m.fileSize; + } + }// loop over grids in segment + is.seekg(seek, std::ios_base::cur); // skip forward from the current position + }// loop over segments + if (n != counter) throw std::runtime_error("stream does not contain a #" + std::to_string(n) + " grid"); + } + } + return handle; +}// readGrid + +/// @brief Read the n'th grid +template +GridHandle readGrid(const std::string& fileName, int n, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handle = readGrid(is, n, buffer); + if (verbose) { + if (n<0) { + std::cout << "Read all NanoGrids from the file named \"" << fileName << "\"" << std::endl; + } else { + std::cout << "Read NanoGrid # " << n << " from the file named \"" << fileName << "\"" << std::endl; + } + } + return handle; // is converted to r-value and return value is move constructed. 
+}// readGrid + +/// @brief Read a specific grid from an input stream given the name of the grid +/// @tparam BufferT Buffer type used for allocation +/// @param is input stream from which to read the grid +/// @param gridName string name of the (first) grid to be returned +/// @param pool optional memory pool from which to allocate the grid buffer +/// @return Return the first grid in the input stream with a specific name +/// @throw std::runtime_error if no grid exists with the specified name +template +GridHandle readGrid(std::istream& is, const std::string& gridName, const BufferT& pool) +{ + try { + GridHandle handle; + handle.read(is, gridName, pool); + return handle; + } catch(const std::logic_error&) { + const auto key = stringHash(gridName); + Segment seg; + while (seg.read(is)) {// loop over all segments in stream + std::streamoff seek = 0; + for (auto& m : seg.meta) {// loop over all grids in segment + if ((m.nameKey == 0u || m.nameKey == key) && m.gridName == gridName) { // check for hash key collision + auto buffer = BufferT::create(m.gridSize, &pool); + is.seekg(seek, std::ios_base::cur); // skip forward to the matching grid + Internal::read(is, buffer, seg.header.codec); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + return GridHandle(std::move(buffer)); + } else { + seek += m.fileSize; + } + } + is.seekg(seek, std::ios_base::cur); // skip forward from the current position + } + } + throw std::runtime_error("Grid name '" + gridName + "' not found in file"); +}// readGrid + +/// @brief Read the first grid with a specific name +template +GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handle = readGrid(is, gridName, buffer); + if (verbose) { + if (handle) { + std::cout << "Read NanoGrid named \"" << gridName << "\" from the file named \"" << fileName << "\"" << std::endl; + } else { + std::cout << "File named \"" << fileName << "\" does not contain a grid named \"" + gridName + "\"" << std::endl; + } + } + return handle; // is converted to r-value and return value is move constructed. +}// readGrid + +// --------------------------> readGrids <------------------------------------ + +template class VecT = std::vector> +VecT> readGrids(std::istream& is, const BufferT& pool = BufferT()) +{ + VecT> handles; + try {//first try to read a raw grid buffer + GridHandle handle; + handle.read(is, pool);// will throw if stream does not contain a raw grid buffer + handles.push_back(std::move(handle)); // force move copy assignment + } catch(const std::logic_error&) { + Segment seg; + while (seg.read(is)) { + uint64_t bufferSize = 0; + for (auto& m : seg.meta) bufferSize += m.gridSize; + auto buffer = BufferT::create(bufferSize, &pool); + uint64_t bufferOffset = 0; + for (uint16_t i = 0; i < seg.header.gridCount; ++i) { + auto *data = util::PtrAdd(buffer.data(), bufferOffset); + Internal::read(is, (char*)data, seg.meta[i].gridSize, seg.header.codec); + tools::updateGridCount(data, uint32_t(i), uint32_t(seg.header.gridCount)); + bufferOffset += seg.meta[i].gridSize; + }// loop over grids in segment + handles.emplace_back(std::move(buffer)); // force move copy assignment + }// loop over segments + } + return handles; // is converted to r-value and return value is move constructed. 
+}// readGrids + +/// @brief Read all the grids +template class VecT> +VecT> readGrids(const std::string& fileName, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handles = readGrids(is, buffer); + if (verbose) std::cout << "Read " << handles.size() << " NanoGrid(s) from the file named \"" << fileName << "\"" << std::endl; + return handles; // is converted to r-value and return value is move constructed. +}// readGrids + +// --------------------------> readGridMetaData <------------------------------------ + +inline std::vector readGridMetaData(const std::string& fileName) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + return readGridMetaData(is); // is converted to r-value and return value is move constructed. +}// readGridMetaData + +inline std::vector readGridMetaData(std::istream& is) +{ + Segment seg; + std::vector meta; + try { + GridHandle<> handle;// if stream contains a raw grid buffer we unfortunately have to load everything + handle.read(is); + seg.add(handle); + meta = std::move(seg.meta); + } catch(const std::logic_error&) { + while (seg.read(is)) { + std::streamoff skip = 0; + for (auto& m : seg.meta) { + meta.push_back(m); + skip += m.fileSize; + }// loop over grid meta data in segment + is.seekg(skip, std::ios_base::cur); + }// loop over segments + } + return meta; // is converted to r-value and return value is move constructed. +}// readGridMetaData + +// --------------------------> hasGrid <------------------------------------ + +inline bool hasGrid(const std::string& fileName, const std::string& gridName) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + return hasGrid(is, gridName); +}// hasGrid + +inline bool hasGrid(std::istream& is, const std::string& gridName) +{ + const auto key = stringHash(gridName); + Segment seg; + while (seg.read(is)) { + std::streamoff seek = 0; + for (auto& m : seg.meta) { + if (m.nameKey == key && m.gridName == gridName) return true; // check for hash key collision + seek += m.fileSize; + }// loop over grid meta data in segment + is.seekg(seek, std::ios_base::cur); + }// loop over segments + return false; +}// hasGrid + +// --------------------------> stringHash <------------------------------------ + +inline uint64_t stringHash(const char* c_str) +{ + uint64_t hash = 0;// zero is returned when cstr = nullptr or "\0" + if (c_str) { + for (auto* str = reinterpret_cast(c_str); *str; ++str) { + uint64_t overflow = hash >> (64 - 8); + hash *= 67; // Next-ish prime after 26 + 26 + 10 + hash += *str + overflow; + } + } + return hash; +}// stringHash + +} // namespace io ====================================================================== + +template +inline std::ostream& +operator<<(std::ostream& os, const math::BBox>& b) +{ + os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " + << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; + return os; +} + +inline std::ostream& +operator<<(std::ostream& os, const CoordBBox& b) +{ + os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " + << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; + return os; +} + +inline std::ostream& 
+
+template<typename T>
+inline std::ostream&
+operator<<(std::ostream& os, const math::BBox<math::Vec3<T>>& b)
+{
+    os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> "
+       << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")";
+    return os;
+}
+
+inline std::ostream&
+operator<<(std::ostream& os, const CoordBBox& b)
+{
+    os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> "
+       << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")";
+    return os;
+}
+
+inline std::ostream&
+operator<<(std::ostream& os, const Coord& ijk)
+{
+    os << "(" << ijk[0] << "," << ijk[1] << "," << ijk[2] << ")";
+    return os;
+}
+
+template<typename T>
+inline std::ostream&
+operator<<(std::ostream& os, const math::Vec3<T>& v)
+{
+    os << "(" << v[0] << "," << v[1] << "," << v[2] << ")";
+    return os;
+}
+
+template<typename T>
+inline std::ostream&
+operator<<(std::ostream& os, const math::Vec4<T>& v)
+{
+    os << "(" << v[0] << "," << v[1] << "," << v[2] << "," << v[3] << ")";
+    return os;
+}
+
+} // namespace nanovdb ===================================================================
+
+#endif // NANOVDB_IO_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/util/CSampleFromVoxels.h b/nanovdb/nanovdb/math/CSampleFromVoxels.h
similarity index 100%
rename from nanovdb/nanovdb/util/CSampleFromVoxels.h
rename to nanovdb/nanovdb/math/CSampleFromVoxels.h
diff --git a/nanovdb/nanovdb/math/DitherLUT.h b/nanovdb/nanovdb/math/DitherLUT.h
new file mode 100644
index 0000000000..4a980e2aa1
--- /dev/null
+++ b/nanovdb/nanovdb/math/DitherLUT.h
@@ -0,0 +1,189 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+//
+/// @author Jeff Lait
+///
+/// @date May 13, 2021
+///
+/// @file DitherLUT.h
+///
+/// @brief Defines a lookup table used to dither 8^3 leaf nodes.
+
+#ifndef NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
+#define NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
+
+#include <nanovdb/math/Math.h> // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4
+
+namespace nanovdb {
+
+namespace math {
+
+class DitherLUT
+{
+    const bool mEnable;
+public:
+    /// @brief Constructor with an optional flag that enables or disables dithering
+    __hostdev__ DitherLUT(bool enable = true) : mEnable(enable) {}
+
+    /// @brief Retrieves the dither threshold for an offset within an 8^3 leaf node.
+    ///
+    /// @param offset into the lookup table of size 512
+    __hostdev__ float operator()(const int offset)
+    {
+
+// This table was generated with
+/**************
+
+static constexpr inline uint32
+SYSwang_inthash(uint32 key)
+{
+    // From http://www.concentric.net/~Ttwang/tech/inthash.htm
+    key += ~(key << 16);
+    key ^= (key >> 5);
+    key += (key << 3);
+    key ^= (key >> 13);
+    key += ~(key << 9);
+    key ^= (key >> 17);
+    return key;
+}
+
+static void
+ut_initDitherR(float *pattern, float offset,
+               int x, int y, int z, int res, int goalres)
+{
+    // These offsets are designed to maximize the difference between
+    // dither values in nearby voxels within a given 2x2x2 cell, without
+    // producing axis-aligned artifacts. They are organized in row-major
+    // order.
+    static const float theDitherOffset[] = {0,4,6,2,5,1,3,7};
+    static const float theScale = 0.125F;
+    int key = (((z << res) + y) << res) + x;
+
+    if (res == goalres)
+    {
+        pattern[key] = offset;
+        return;
+    }
+
+    // Randomly flip (on each axis) the dithering patterns used by the
+    // subcells. This key is xor'd with the subcell index below before
+    // looking up in the dither offset list.
+    key = SYSwang_inthash(key) & 7;
+
+    x <<= 1;
+    y <<= 1;
+    z <<= 1;
+
+    offset *= theScale;
+    for (int i = 0; i < 8; i++)
+        ut_initDitherR(pattern, offset+theDitherOffset[i ^ key]*theScale,
+                       x+(i&1), y+((i&2)>>1), z+((i&4)>>2), res+1, goalres);
+}
+
+// This is a compact algorithm that accomplishes essentially the same thing
+// as ut_initDither() above. We should eventually switch to use this and
+// clean the dead code.
+static fpreal32 * +ut_initDitherRecursive(int goalres) +{ + const int nfloat = 1 << (goalres*3); + float *pattern = new float[nfloat]; + ut_initDitherR(pattern, 1.0F, 0, 0, 0, 0, goalres); + + // This has built an even spacing from 1/nfloat to 1.0. + // however, our dither pattern should be 1/(nfloat+1) to nfloat/(nfloat+1) + // So we do a correction here. Note that the earlier calculations are + // done with powers of 2 so are exact, so it does make sense to delay + // the renormalization to this pass. + float correctionterm = nfloat / (nfloat+1.0F); + for (int i = 0; i < nfloat; i++) + pattern[i] *= correctionterm; + return pattern; +} + + theDitherMatrix = ut_initDitherRecursive(3); + + for (int i = 0; i < 512/8; i ++) + { + for (int j = 0; j < 8; j ++) + std::cout << theDitherMatrix[i*8+j] << "f, "; + std::cout << std::endl; + } + + **************/ + static const float LUT[512] = + { + 0.14425f, 0.643275f, 0.830409f, 0.331384f, 0.105263f, 0.604289f, 0.167641f, 0.666667f, + 0.892788f, 0.393762f, 0.0818713f, 0.580897f, 0.853801f, 0.354776f, 0.916179f, 0.417154f, + 0.612086f, 0.11306f, 0.79922f, 0.300195f, 0.510721f, 0.0116959f, 0.947368f, 0.448343f, + 0.362573f, 0.861598f, 0.0506823f, 0.549708f, 0.261209f, 0.760234f, 0.19883f, 0.697856f, + 0.140351f, 0.639376f, 0.576998f, 0.0779727f, 0.522417f, 0.0233918f, 0.460039f, 0.959064f, + 0.888889f, 0.389864f, 0.327485f, 0.826511f, 0.272904f, 0.77193f, 0.709552f, 0.210526f, + 0.483431f, 0.982456f, 0.296296f, 0.795322f, 0.116959f, 0.615984f, 0.0545809f, 0.553606f, + 0.732943f, 0.233918f, 0.545809f, 0.0467836f, 0.865497f, 0.366472f, 0.803119f, 0.304094f, + 0.518519f, 0.0194932f, 0.45614f, 0.955166f, 0.729045f, 0.230019f, 0.54191f, 0.042885f, + 0.269006f, 0.768031f, 0.705653f, 0.206628f, 0.479532f, 0.978558f, 0.292398f, 0.791423f, + 0.237817f, 0.736842f, 0.424951f, 0.923977f, 0.136452f, 0.635478f, 0.323587f, 0.822612f, + 0.986355f, 0.487329f, 0.674464f, 0.175439f, 0.88499f, 0.385965f, 0.573099f, 0.0740741f, + 0.51462f, 0.0155945f, 0.202729f, 0.701754f, 0.148148f, 0.647174f, 0.834308f, 0.335283f, + 0.265107f, 0.764133f, 0.951267f, 0.452242f, 0.896686f, 0.397661f, 0.08577f, 0.584795f, + 0.8577f, 0.358674f, 0.920078f, 0.421053f, 0.740741f, 0.241715f, 0.678363f, 0.179337f, + 0.109162f, 0.608187f, 0.17154f, 0.670565f, 0.491228f, 0.990253f, 0.42885f, 0.927875f, + 0.0662768f, 0.565302f, 0.62768f, 0.128655f, 0.183236f, 0.682261f, 0.744639f, 0.245614f, + 0.814815f, 0.315789f, 0.378168f, 0.877193f, 0.931774f, 0.432749f, 0.495127f, 0.994152f, + 0.0350877f, 0.534113f, 0.97076f, 0.471735f, 0.214425f, 0.71345f, 0.526316f, 0.0272904f, + 0.783626f, 0.2846f, 0.222222f, 0.721248f, 0.962963f, 0.463938f, 0.276803f, 0.775828f, + 0.966862f, 0.467836f, 0.405458f, 0.904483f, 0.0701754f, 0.569201f, 0.881092f, 0.382066f, + 0.218324f, 0.717349f, 0.654971f, 0.155945f, 0.818713f, 0.319688f, 0.132554f, 0.631579f, + 0.0623782f, 0.561404f, 0.748538f, 0.249513f, 0.912281f, 0.413255f, 0.974659f, 0.475634f, + 0.810916f, 0.311891f, 0.499025f, 0.998051f, 0.163743f, 0.662768f, 0.226121f, 0.725146f, + 0.690058f, 0.191033f, 0.00389864f, 0.502924f, 0.557505f, 0.0584795f, 0.120858f, 0.619883f, + 0.440546f, 0.939571f, 0.752437f, 0.253411f, 0.307992f, 0.807018f, 0.869396f, 0.37037f, + 0.658869f, 0.159844f, 0.346979f, 0.846004f, 0.588694f, 0.0896686f, 0.152047f, 0.651072f, + 0.409357f, 0.908382f, 0.596491f, 0.0974659f, 0.339181f, 0.838207f, 0.900585f, 0.401559f, + 0.34308f, 0.842105f, 0.779727f, 0.280702f, 0.693957f, 0.194932f, 0.25731f, 0.756335f, + 0.592593f, 0.0935673f, 0.0311891f, 
0.530214f, 0.444444f, 0.94347f, 0.506823f, 0.00779727f, + 0.68616f, 0.187135f, 0.124756f, 0.623782f, 0.288499f, 0.787524f, 0.350877f, 0.849903f, + 0.436647f, 0.935673f, 0.873294f, 0.374269f, 0.538012f, 0.0389864f, 0.60039f, 0.101365f, + 0.57115f, 0.0721248f, 0.758285f, 0.259259f, 0.719298f, 0.220273f, 0.532164f, 0.0331384f, + 0.321637f, 0.820663f, 0.00974659f, 0.508772f, 0.469786f, 0.968811f, 0.282651f, 0.781676f, + 0.539961f, 0.0409357f, 0.727096f, 0.22807f, 0.500975f, 0.00194932f, 0.563353f, 0.0643275f, + 0.290448f, 0.789474f, 0.477583f, 0.976608f, 0.251462f, 0.750487f, 0.31384f, 0.812865f, + 0.94152f, 0.442495f, 0.879142f, 0.380117f, 0.37232f, 0.871345f, 0.309942f, 0.808967f, + 0.192982f, 0.692008f, 0.130604f, 0.62963f, 0.621832f, 0.122807f, 0.559454f, 0.0604289f, + 0.660819f, 0.161793f, 0.723197f, 0.224172f, 0.403509f, 0.902534f, 0.840156f, 0.341131f, + 0.411306f, 0.910331f, 0.473684f, 0.97271f, 0.653021f, 0.153996f, 0.0916179f, 0.590643f, + 0.196881f, 0.695906f, 0.384016f, 0.883041f, 0.0955166f, 0.594542f, 0.157895f, 0.65692f, + 0.945419f, 0.446394f, 0.633528f, 0.134503f, 0.844055f, 0.345029f, 0.906433f, 0.407407f, + 0.165692f, 0.664717f, 0.103314f, 0.602339f, 0.126706f, 0.625731f, 0.189084f, 0.688109f, + 0.91423f, 0.415205f, 0.851852f, 0.352827f, 0.875244f, 0.376218f, 0.937622f, 0.438596f, + 0.317739f, 0.816764f, 0.255361f, 0.754386f, 0.996101f, 0.497076f, 0.933723f, 0.434698f, + 0.567251f, 0.0682261f, 0.504873f, 0.00584795f, 0.247563f, 0.746589f, 0.185185f, 0.684211f, + 0.037037f, 0.536062f, 0.0994152f, 0.598441f, 0.777778f, 0.278752f, 0.465887f, 0.964912f, + 0.785575f, 0.28655f, 0.847953f, 0.348928f, 0.0292398f, 0.528265f, 0.7154f, 0.216374f, + 0.39961f, 0.898636f, 0.961014f, 0.461988f, 0.0487329f, 0.547758f, 0.111111f, 0.610136f, + 0.649123f, 0.150097f, 0.212476f, 0.711501f, 0.797271f, 0.298246f, 0.859649f, 0.360624f, + 0.118908f, 0.617934f, 0.0565302f, 0.555556f, 0.329435f, 0.82846f, 0.516569f, 0.0175439f, + 0.867446f, 0.368421f, 0.805068f, 0.306043f, 0.578947f, 0.079922f, 0.267057f, 0.766082f, + 0.270955f, 0.76998f, 0.707602f, 0.208577f, 0.668616f, 0.169591f, 0.606238f, 0.107212f, + 0.520468f, 0.0214425f, 0.45809f, 0.957115f, 0.419103f, 0.918129f, 0.356725f, 0.855751f, + 0.988304f, 0.489279f, 0.426901f, 0.925926f, 0.450292f, 0.949318f, 0.512671f, 0.0136452f, + 0.239766f, 0.738791f, 0.676413f, 0.177388f, 0.699805f, 0.20078f, 0.263158f, 0.762183f, + 0.773879f, 0.274854f, 0.337232f, 0.836257f, 0.672515f, 0.173489f, 0.734893f, 0.235867f, + 0.0253411f, 0.524366f, 0.586745f, 0.0877193f, 0.423002f, 0.922027f, 0.48538f, 0.984405f, + 0.74269f, 0.243665f, 0.680312f, 0.181287f, 0.953216f, 0.454191f, 0.1423f, 0.641326f, + 0.493177f, 0.992203f, 0.430799f, 0.929825f, 0.204678f, 0.703704f, 0.890838f, 0.391813f, + 0.894737f, 0.395712f, 0.0838207f, 0.582846f, 0.0448343f, 0.54386f, 0.231969f, 0.730994f, + 0.146199f, 0.645224f, 0.832359f, 0.333333f, 0.793372f, 0.294347f, 0.980507f, 0.481481f, + 0.364522f, 0.863548f, 0.80117f, 0.302144f, 0.824561f, 0.325536f, 0.138402f, 0.637427f, + 0.614035f, 0.11501f, 0.0526316f, 0.551657f, 0.0760234f, 0.575049f, 0.88694f, 0.387914f, + }; + return mEnable ? LUT[offset & 511] : 0.5f;// branch prediction should optimize this! 
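+        // Editorial note (illustrative): a typical use of this threshold is
+        // dithered quantization of a leaf value v in [0,1], where n is the
+        // voxel's linear offset within its 8^3 leaf. With dithering disabled
+        // the 0.5f fallback reduces this to round-to-nearest:
+        //   DitherLUT lut;
+        //   int q = int(v * 255.0f + lut(n));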
+    }
+}; // DitherLUT class
+
+}// namespace math
+
+}// namespace nanovdb
+
+#endif // NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/math/HDDA.h b/nanovdb/nanovdb/math/HDDA.h
new file mode 100644
index 0000000000..420692c833
--- /dev/null
+++ b/nanovdb/nanovdb/math/HDDA.h
@@ -0,0 +1,510 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/// @file HDDA.h
+///
+/// @author Ken Museth
+///
+/// @brief Hierarchical Digital Differential Analyzers specialized for VDB.
+
+#ifndef NANOVDB_HDDA_H_HAS_BEEN_INCLUDED
+#define NANOVDB_HDDA_H_HAS_BEEN_INCLUDED
+
+// Comment out to disable this explicit round-off check
+#define ENFORCE_FORWARD_STEPPING
+
+#include <nanovdb/math/Ray.h> // only dependency
+
+namespace nanovdb::math {
+
+/// @brief A Digital Differential Analyzer specialized for OpenVDB grids
+/// @note Conceptually similar to Bresenham's line algorithm applied
+///       to a 3D Ray intersecting OpenVDB nodes or voxels. Log2Dim = 0
+///       corresponds to a voxel and Log2Dim > 0 to a tree node of size 2^Log2Dim.
+///
+/// @note The Ray template class is expected to have the following
+///       methods: test(time), t0(), t1(), invDir(), and operator()(time).
+///       See the Ray class in Ray.h for their definitions.
+template<typename RayT, typename CoordT = Coord>
+class HDDA
+{
+public:
+    using RealType = typename RayT::RealType;
+    using RealT = RealType;
+    using Vec3Type = typename RayT::Vec3Type;
+    using Vec3T = Vec3Type;
+    using CoordType = CoordT;
+
+    /// @brief Default ctor
+    HDDA() = default;
+
+    /// @brief ctor from ray and dimension at which the DDA marches
+    __hostdev__ HDDA(const RayT& ray, int dim) { this->init(ray, dim); }
+
+    /// @brief Re-initializes the HDDA
+    __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime, int dim)
+    {
+        assert(startTime <= maxTime);
+        mDim = dim;
+        mT0 = startTime;
+        mT1 = maxTime;
+        const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir();
+        mVoxel = RoundDown<CoordT>(pos) & (~(dim - 1));
+        for (int axis = 0; axis < 3; ++axis) {
+            if (dir[axis] == RealT(0)) { //handles dir = +/- 0
+                mNext[axis] = Maximum<RealT>::value(); //i.e. disabled!
+                mStep[axis] = 0;
+            } else if (inv[axis] > 0) {
+                mStep[axis] = 1;
+                mNext[axis] = mT0 + (mVoxel[axis] + dim - pos[axis]) * inv[axis];
+                mDelta[axis] = inv[axis];
+            } else {
+                mStep[axis] = -1;
+                mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis];
+                mDelta[axis] = -inv[axis];
+            }
+        }
+    }
+
+    /// @brief Similar to init above, except it uses the bounds of the input ray
+    __hostdev__ void init(const RayT& ray, int dim) { this->init(ray, ray.t0(), ray.t1(), dim); }
+
+    /// @brief Updates the HDDA to march with the specified dimension
+    __hostdev__ bool update(const RayT& ray, int dim)
+    {
+        if (mDim == dim)
+            return false;
+        mDim = dim;
+        const Vec3T &pos = ray(mT0), &inv = ray.invDir();
+        mVoxel = RoundDown<CoordT>(pos) & (~(dim - 1));
+        for (int axis = 0; axis < 3; ++axis) {
+            if (mStep[axis] == 0)
+                continue;
+            mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis];
+            if (mStep[axis] > 0)
+                mNext[axis] += dim * inv[axis];
+        }
+
+        return true;
+    }
+
+    __hostdev__ int dim() const { return mDim; }
+
+    /// @brief Increment the voxel index to the next intersected voxel or node
+    ///        and return true if the step in time does not exceed maxTime.
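+    /// @par Example (editorial sketch, not part of the original documentation):
+    /// @code
+    /// HDDA<RayT> hdda(ray, acc.getDim(ijk, ray));
+    /// while (hdda.step()) {// march until the ray bound is exceeded
+    ///     // hdda.voxel() is the origin of the next node/voxel along the ray
+    ///     hdda.update(ray, acc.getDim(hdda.voxel(), ray));// adapt to the local tree depth
+    /// }
+    /// @endcode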
+ __hostdev__ bool step() + { + const int axis = MinIndex(mNext); +#if 1 + switch (axis) { + case 0: + return step<0>(); + case 1: + return step<1>(); + default: + return step<2>(); + } +#else + mT0 = mNext[axis]; + mNext[axis] += mDim * mDelta[axis]; + mVoxel[axis] += mDim * mStep[axis]; + return mT0 <= mT1; +#endif + } + + /// @brief Return the index coordinates of the next node or voxel + /// intersected by the ray. If Log2Dim = 0 the return value is the + /// actual signed coordinate of the voxel, else it is the origin + /// of the corresponding VDB tree node or tile. + /// @note Incurs no computational overhead. + __hostdev__ const CoordT& voxel() const { return mVoxel; } + + /// @brief Return the time (parameterized along the Ray) of the + /// first hit of a tree node of size 2^Log2Dim. + /// @details This value is initialized to startTime or ray.t0() + /// depending on the constructor used. + /// @note Incurs no computational overhead. + __hostdev__ RealType time() const { return mT0; } + + /// @brief Return the maximum time (parameterized along the Ray). + __hostdev__ RealType maxTime() const { return mT1; } + + /// @brief Return the time (parameterized along the Ray) of the + /// second (i.e. next) hit of a tree node of size 2^Log2Dim. + /// @note Incurs a (small) computational overhead. + __hostdev__ RealType next() const + { +#if 1 //def __CUDA_ARCH__ + return fminf(mT1, fminf(mNext[0], fminf(mNext[1], mNext[2]))); +#else + return std::min(mT1, std::min(mNext[0], std::min(mNext[1], mNext[2]))); +#endif + } + +private: + // helper to implement the general form + template + __hostdev__ bool step() + { +#ifdef ENFORCE_FORWARD_STEPPING + //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + fmaxf(mNext[axis]*1.0e-6f, 1.0e-6f); + //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + (mNext[axis] + 1.0f)*1.0e-6f; + if (mNext[axis] <= mT0) { + mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; + } +#endif + mT0 = mNext[axis]; + mNext[ axis] += mDim * mDelta[axis]; + mVoxel[axis] += mDim * mStep[ axis]; + return mT0 <= mT1; + } + + int32_t mDim; + RealT mT0, mT1; // min and max allowed times + CoordT mVoxel, mStep; // current voxel location and step to next voxel location + Vec3T mDelta, mNext; // delta time and next time +}; // class HDDA + +/////////////////////////////////////////// ZeroCrossing //////////////////////////////////////////// + +/// @brief returns true if the ray intersects a zero-crossing at the voxel level of the grid in the accessor +/// The empty-space ray-marching is performed at all levels of the tree using an +/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the closest +/// voxel after the intersection point, v contains the grid values at ijk, and t is set to the time of +/// the intersection along the ray. 
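+/// @par Example (editorial sketch; the grid, accessor and ray names are assumptions):
+/// @code
+/// auto acc = grid->getAccessor();// grid is a NanoGrid<float>*
+/// auto iRay = wRay.worldToIndexF(*grid);// world-space ray mapped to index space
+/// Coord ijk; float v, t;
+/// if (ZeroCrossing(iRay, acc, ijk, v, t)) {
+///     // the level set is crossed at index-space time t, near voxel ijk with value v
+/// }
+/// @endcode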
+template<typename RayT, typename AccT>
+inline __hostdev__ bool ZeroCrossing(RayT& ray, AccT& acc, Coord& ijk, typename AccT::ValueType& v, float& t)
+{
+    if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20)
+        return false; // clip ray to bbox
+    static const float Delta = 1.0001f;
+    ijk = RoundDown<Coord>(ray.start()); // first hit of bbox
+    HDDA<RayT> hdda(ray, acc.getDim(ijk, ray));
+    const auto v0 = acc.getValue(ijk);
+    while (hdda.step()) {
+        ijk = RoundDown<Coord>(ray(hdda.time() + Delta));
+        hdda.update(ray, acc.getDim(ijk, ray));
+        if (hdda.dim() > 1 || !acc.isActive(ijk))
+            continue; // either a tile value or an inactive voxel
+        while (hdda.step() && acc.isActive(hdda.voxel())) { // in the narrow band
+            v = acc.getValue(hdda.voxel());
+            if (v * v0 < 0) { // zero crossing
+                ijk = hdda.voxel();
+                t = hdda.time();
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/////////////////////////////////////////// DDA ////////////////////////////////////////////
+
+/// @brief A Digital Differential Analyzer. Unlike HDDA (defined above) this DDA
+///        uses a fixed step-size defined by the template parameter Dim!
+///
+/// @note The Ray template class is expected to have the following
+///       methods: test(time), t0(), t1(), invDir(), and operator()(time).
+///       See the Ray class in Ray.h for their definitions.
+template<typename RayT, typename CoordT = Coord, int Dim = 1>
+class DDA
+{
+    static_assert(Dim >= 1, "Dim must be >= 1");
+
+public:
+    using RealType = typename RayT::RealType;
+    using RealT = RealType;
+    using Vec3Type = typename RayT::Vec3Type;
+    using Vec3T = Vec3Type;
+    using CoordType = CoordT;
+
+    /// @brief Default ctor
+    DDA() = default;
+
+    /// @brief ctor from a ray, which also defines the bounds of the march
+    __hostdev__ DDA(const RayT& ray) { this->init(ray); }
+
+    /// @brief Re-initializes the DDA
+    __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime)
+    {
+        assert(startTime <= maxTime);
+        mT0 = startTime;
+        mT1 = maxTime;
+        const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir();
+        mVoxel = RoundDown<CoordT>(pos) & (~(Dim - 1));
+        for (int axis = 0; axis < 3; ++axis) {
+            if (dir[axis] == RealT(0)) { //handles dir = +/- 0
+                mNext[axis] = Maximum<RealT>::value(); //i.e. disabled!
+                mStep[axis] = 0;
+            } else if (inv[axis] > 0) {
+                mStep[axis] = Dim;
+                mNext[axis] = (mT0 + (mVoxel[axis] + Dim - pos[axis]) * inv[axis]);
+                mDelta[axis] = inv[axis];
+            } else {
+                mStep[axis] = -Dim;
+                mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis];
+                mDelta[axis] = -inv[axis];
+            }
+        }
+    }
+
+    /// @brief Similar to init above, except it uses the bounds of the input ray
+    __hostdev__ void init(const RayT& ray) { this->init(ray, ray.t0(), ray.t1()); }
+
+    /// @brief Increment the voxel index to the next intersected voxel or node
+    ///        and return true if the step in time does not exceed maxTime.
+    __hostdev__ bool step()
+    {
+        const int axis = MinIndex(mNext);
+#if 1
+        switch (axis) {
+        case 0:
+            return step<0>();
+        case 1:
+            return step<1>();
+        default:
+            return step<2>();
+        }
+#else
+#ifdef ENFORCE_FORWARD_STEPPING
+        if (mNext[axis] <= mT0) {
+            mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f;
+        }
+#endif
+        mT0 = mNext[axis];
+        mNext[axis] += mDelta[axis];
+        mVoxel[axis] += mStep[axis];
+        return mT0 <= mT1;
+#endif
+    }
+
+    /// @brief Return the index coordinates of the next node or voxel
+    ///        intersected by the ray. If Log2Dim = 0 the return value is the
+    ///        actual signed coordinate of the voxel, else it is the origin
+    ///        of the corresponding VDB tree node or tile.
+    /// @note Incurs no computational overhead.
+ __hostdev__ const CoordT& voxel() const { return mVoxel; } + + /// @brief Return the time (parameterized along the Ray) of the + /// first hit of a tree node of size 2^Log2Dim. + /// @details This value is initialized to startTime or ray.t0() + /// depending on the constructor used. + /// @note Incurs no computational overhead. + __hostdev__ RealType time() const { return mT0; } + + /// @brief Return the maximum time (parameterized along the Ray). + __hostdev__ RealType maxTime() const { return mT1; } + + /// @brief Return the time (parameterized along the Ray) of the + /// second (i.e. next) hit of a tree node of size 2^Log2Dim. + /// @note Incurs a (small) computational overhead. + __hostdev__ RealType next() const + { + return Min(mT1, Min(mNext[0], Min(mNext[1], mNext[2]))); + } + + __hostdev__ int nextAxis() const + { + return nanovdb::math::MinIndex(mNext); + } + +private: + // helper to implement the general form + template + __hostdev__ bool step() + { +#ifdef ENFORCE_FORWARD_STEPPING + if (mNext[axis] <= mT0) { + mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; + } +#endif + mT0 = mNext[axis]; + mNext[axis] += mDelta[axis]; + mVoxel[axis] += mStep[axis]; + return mT0 <= mT1; + } + + RealT mT0, mT1; // min and max allowed times + CoordT mVoxel, mStep; // current voxel location and step to next voxel location + Vec3T mDelta, mNext; // delta time and next time +}; // class DDA + +/////////////////////////////////////////// ZeroCrossingNode //////////////////////////////////////////// + +template +inline __hostdev__ bool ZeroCrossingNode(RayT& ray, const NodeT& node, float v0, nanovdb::math::Coord& ijk, float& v, float& t) +{ + math::BBox bbox(node.origin(), node.origin() + Coord(node.dim() - 1)); + + if (!ray.clip(node.bbox())) { + return false; + } + + const float t0 = ray.t0(); + + static const float Delta = 1.0001f; + ijk = Coord::Floor(ray(ray.t0() + Delta)); + + t = t0; + v = 0; + + DDA dda(ray); + while (dda.step()) { + ijk = dda.voxel(); + + if (bbox.isInside(ijk) == false) + return false; + + v = node.getValue(ijk); + if (v * v0 < 0) { + t = dda.time(); + return true; + } + } + return false; +} + +/////////////////////////////////////////// TreeMarcher //////////////////////////////////////////// + +/// @brief returns true if the ray intersects an active value at any level of the grid in the accessor. +/// The empty-space ray-marching is performed at all levels of the tree using an +/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the first +/// active voxel or tile, and t is set to the time of its intersection along the ray. 
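+/// @par Example (editorial sketch, reusing iRay and acc from the sketch above):
+/// @code
+/// Coord ijk; float t;
+/// if (firstActive(iRay, acc, ijk, t)) {
+///     // ijk is the first active voxel (or tile) along iRay, hit at time t
+/// }
+/// @endcode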
+template<typename RayT, typename AccT>
+inline __hostdev__ bool firstActive(RayT& ray, AccT& acc, Coord& ijk, float& t)
+{
+    if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20) {// clip ray to bbox
+        return false;// missed or undefined bbox
+    }
+    static const float Delta = 1.0001f;// forward step-size along the ray to avoid getting stuck
+    t = ray.t0();// initiate time
+    ijk = RoundDown<Coord>(ray.start()); // first voxel inside bbox
+    for (HDDA<RayT> hdda(ray, acc.getDim(ijk, ray)); !acc.isActive(ijk); hdda.update(ray, acc.getDim(ijk, ray))) {
+        if (!hdda.step()) return false;// leap-frog HDDA and exit if ray bound is exceeded
+        t = hdda.time() + Delta;// update time
+        ijk = RoundDown<Coord>( ray(t) );// update ijk
+    }
+    return true;
+}
+
+/////////////////////////////////////////// TreeMarcher ////////////////////////////////////////////
+
+/// @brief A Tree Marcher for Generic Grids
+
+template<typename NodeT, typename RayT, typename AccT, typename CoordT = Coord>
+class TreeMarcher
+{
+public:
+    using ChildT = typename NodeT::ChildNodeType;
+    using RealType = typename RayT::RealType;
+    using RealT = RealType;
+    using CoordType = CoordT;
+
+    inline __hostdev__ TreeMarcher(AccT& acc)
+        : mAcc(acc)
+    {
+    }
+
+    /// @brief Initialize the TreeMarcher with an index-space ray.
+    inline __hostdev__ bool init(const RayT& indexRay)
+    {
+        mRay = indexRay;
+        if (!mRay.clip(mAcc.root().bbox()))
+            return false; // clip ray to bbox
+
+        // tweak the intersection span into the bbox.
+        // CAVEAT: this will potentially clip some tiny corner intersections.
+        static const float Eps = 0.000001f;
+        const float t0 = mRay.t0() + Eps;
+        const float t1 = mRay.t1() - Eps;
+        if (t0 > t1)
+            return false;
+
+        const CoordT ijk = RoundDown<CoordT>(mRay(t0));
+        const uint32_t dim = mAcc.getDim(ijk, mRay);
+        mHdda.init(mRay, t0, t1, nanovdb::math::Max(dim, NodeT::dim()));
+
+        mT0 = (dim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span.
+        mTmax = t1;
+        return true;
+    }
+
+    /// @brief step the ray through the tree. If the ray hits a node then
+    ///        populate t0 & t1, and the node.
+    /// @return true when a node of type NodeT is intersected, false otherwise.
+    inline __hostdev__ bool step(const NodeT** node, float& t0, float& t1)
+    {
+        // CAVEAT: if Delta is too large then it will clip corners of nodes in a visible way,
+        //         but it has to be quite large when very far from the grid (due to fp32 rounding).
+        static const float Delta = 0.01f;
+        bool hddaIsValid;
+
+        do {
+            t0 = mT0;
+
+            auto currentNode = mAcc.template getNode<NodeT>();
+
+            // get next node intersection...
+            hddaIsValid = mHdda.step();
+            const CoordT nextIjk = RoundDown<CoordT>(mRay(mHdda.time() + Delta));
+            const auto nextDim = mAcc.getDim(nextIjk, mRay);
+            mHdda.update(mRay, (int)Max(nextDim, NodeT::dim()));
+            mT0 = (nextDim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span.
+
+            if (t0 >= 0) { // we are in a span.
+                t1 = Min(mTmax, mHdda.time());
+
+                // TODO: clean this up!
+                if (t0 >= t1 || currentNode == nullptr)
+                    continue;
+
+                *node = currentNode;
+                return true;
+            }
+
+        } while (hddaIsValid);
+
+        return false;
+    }
+
+    inline __hostdev__ const RayT& ray() const { return mRay; }
+
+    inline __hostdev__ RayT& ray() { return mRay; }
+
+private:
+    AccT& mAcc;
+    RayT mRay;
+    HDDA<RayT, CoordT> mHdda;
+    float mT0;
+    float mTmax;
+};// TreeMarcher
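+
+// Editorial sketch (illustrative, not part of the original header): visiting
+// all leaf nodes intersected by an index-space ray; LeafT is an assumption.
+//
+//   using LeafT = nanovdb::NanoLeaf<float>;
+//   TreeMarcher<LeafT, RayT, AccT> marcher(acc);
+//   if (marcher.init(indexRay)) {
+//       const LeafT* leaf = nullptr;
+//       float t0 = 0, t1 = 0;
+//       while (marcher.step(&leaf, t0, t1)) {
+//           // the ray overlaps *leaf for times in [t0, t1)
+//       }
+//   }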
+
+/////////////////////////////////////////// PointTreeMarcher ////////////////////////////////////////////
+
+/// @brief A Tree Marcher for Point Grids
+///
+/// @note This class correctly handles offsetting the ray by 0.5 to ensure that
+///       the underlying HDDA intersects with the grid cells. See details below.
+template<typename AccT, typename RayT, typename CoordT = Coord>
+class PointTreeMarcher : public TreeMarcher<LeafNode<typename AccT::ValueType>, RayT, AccT, CoordT>
+{
+    using BaseT = TreeMarcher<LeafNode<typename AccT::ValueType>, RayT, AccT, CoordT>;
+public:
+    __hostdev__ PointTreeMarcher(AccT& acc) : BaseT(acc) {}
+
+    /// @brief Initializes this instance with a ray in index space.
+    ///
+    /// @details An offset of 0.5 is applied to the ray to account for the fact that points in vdb
+    ///          grids are bucketed into so-called grid cells, which are centered around grid voxels,
+    ///          whereas the DDA is based on so-called grid nodes, which are coincident with grid
+    ///          voxels. So, rather than offsetting the points by 0.5 to bring them into a grid
+    ///          node representation, this method offsets the eye of the ray by 0.5, which effectively
+    ///          ensures that the DDA operates on grid cells as opposed to grid nodes. This subtle
+    ///          but important offset by 0.5 is explained in more detail in our online documentation.
+    __hostdev__ bool init(RayT ray) { return BaseT::init(ray.offsetEye(0.5)); }
+};// PointTreeMarcher
+
+} // namespace nanovdb::math
+
+#endif // NANOVDB_HDDA_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/math/Math.h b/nanovdb/nanovdb/math/Math.h
new file mode 100644
index 0000000000..84100d499c
--- /dev/null
+++ b/nanovdb/nanovdb/math/Math.h
@@ -0,0 +1,1448 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file Math.h
+
+    \author Ken Museth
+
+    \date January 8, 2020
+
+    \brief Math functions and classes
+
+*/
+
+#ifndef NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED
+#define NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/util/Util.h> // for __hostdev__ and lots of other utility functions
+
+namespace nanovdb {// =================================================================
+
+namespace math {// =============================================================
+
+// ----------------------------> Various math functions <-------------------------------------
+
+//@{
+/// @brief Pi constant taken from Boost to match old behaviour
+template<typename T>
+inline __hostdev__ constexpr T pi()
+{
+    return 3.141592653589793238462643383279502884e+00;
+}
+template<>
+inline __hostdev__ constexpr float pi<float>()
+{
+    return 3.141592653589793238462643383279502884e+00F;
+}
+template<>
+inline __hostdev__ constexpr double pi<double>()
+{
+    return 3.141592653589793238462643383279502884e+00;
+}
+template<>
+inline __hostdev__ constexpr long double pi<long double>()
+{
+    return 3.141592653589793238462643383279502884e+00L;
+}
+//@}
+
+//@{
+/// Tolerance for floating-point comparison
+template<typename T>
+struct Tolerance;
+template<>
+struct Tolerance<float>
+{
+    __hostdev__ static float value() { return 1e-8f; }
+};
+template<>
+struct Tolerance<double>
+{
+    __hostdev__ static double value() { return 1e-15; }
+};
+//@}
+
+//@{
+/// Delta for small floating-point offsets
+template<typename T>
+struct Delta;
+template<>
+struct Delta<float>
+{
+    __hostdev__ static float value() { return 1e-5f; }
+};
+template<>
+struct Delta<double>
+{
+    __hostdev__ static double value() { return 1e-9; }
+};
+//@}
+
+//@{
+/// Maximum floating-point values
+template<typename T>
+struct Maximum;
+#if defined(__CUDA_ARCH__) || defined(__HIP__)
+template<>
+struct Maximum<int>
+{
+    __hostdev__ static int value() { return 2147483647; }
+};
+template<>
+struct Maximum<uint32_t>
+{
+    __hostdev__ static uint32_t value() { return 4294967295u; }
+};
+template<>
+struct Maximum<float>
+{
+    __hostdev__ static float value() { return 1e+38f; }
+};
+template<>
+struct Maximum<double>
+{
+    __hostdev__ static double value() { return 1e+308; }
+};
+#else
+template<typename T>
+struct Maximum
+{
+    static T value() { return std::numeric_limits<T>::max(); }
+};
+#endif
+//@}
+
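+// Editorial note (illustrative): Tolerance<T>, Delta<T> and Maximum<T> stand in
+// for std::numeric_limits, which is not usable in device code. E.g. an
+// approximate comparison can be written as:
+//
+//   template<typename T>
+//   __hostdev__ bool approxEqual(T a, T b) { return Abs(a - b) <= Tolerance<T>::value(); }
+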
+template +__hostdev__ inline bool isApproxZero(const Type& x) +{ + return !(x > Tolerance::value()) && !(x < -Tolerance::value()); +} + +template +__hostdev__ inline Type Min(Type a, Type b) +{ + return (a < b) ? a : b; +} +__hostdev__ inline int32_t Min(int32_t a, int32_t b) +{ + return int32_t(fminf(float(a), float(b))); +} +__hostdev__ inline uint32_t Min(uint32_t a, uint32_t b) +{ + return uint32_t(fminf(float(a), float(b))); +} +__hostdev__ inline float Min(float a, float b) +{ + return fminf(a, b); +} +__hostdev__ inline double Min(double a, double b) +{ + return fmin(a, b); +} +template +__hostdev__ inline Type Max(Type a, Type b) +{ + return (a > b) ? a : b; +} + +__hostdev__ inline int32_t Max(int32_t a, int32_t b) +{ + return int32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline uint32_t Max(uint32_t a, uint32_t b) +{ + return uint32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline float Max(float a, float b) +{ + return fmaxf(a, b); +} +__hostdev__ inline double Max(double a, double b) +{ + return fmax(a, b); +} +__hostdev__ inline float Clamp(float x, float a, float b) +{ + return Max(Min(x, b), a); +} +__hostdev__ inline double Clamp(double x, double a, double b) +{ + return Max(Min(x, b), a); +} + +__hostdev__ inline float Fract(float x) +{ + return x - floorf(x); +} +__hostdev__ inline double Fract(double x) +{ + return x - floor(x); +} + +__hostdev__ inline int32_t Floor(float x) +{ + return int32_t(floorf(x)); +} +__hostdev__ inline int32_t Floor(double x) +{ + return int32_t(floor(x)); +} + +__hostdev__ inline int32_t Ceil(float x) +{ + return int32_t(ceilf(x)); +} +__hostdev__ inline int32_t Ceil(double x) +{ + return int32_t(ceil(x)); +} + +template +__hostdev__ inline T Pow2(T x) +{ + return x * x; +} + +template +__hostdev__ inline T Pow3(T x) +{ + return x * x * x; +} + +template +__hostdev__ inline T Pow4(T x) +{ + return Pow2(x * x); +} +template +__hostdev__ inline T Abs(T x) +{ + return x < 0 ? -x : x; +} + +template<> +__hostdev__ inline float Abs(float x) +{ + return fabsf(x); +} + +template<> +__hostdev__ inline double Abs(double x) +{ + return fabs(x); +} + +template<> +__hostdev__ inline int Abs(int x) +{ + return abs(x); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz); + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); + //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); + //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); +} + +template class Vec3T> +__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) +{ + return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); +} + +//@{ +/// Return the square root of a floating-point value. +__hostdev__ inline float Sqrt(float x) +{ + return sqrtf(x); +} +__hostdev__ inline double Sqrt(double x) +{ + return sqrt(x); +} +//@} + +/// Return the sign of the given value as an integer (either -1, 0 or 1). +template +__hostdev__ inline T Sign(const T& x) +{ + return ((T(0) < x) ? T(1) : T(0)) - ((x < T(0)) ? 
T(1) : T(0)); +} + +template +__hostdev__ inline int MinIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] < v[1] && v[0] < v[2]) + return 0; + if (v[1] < v[2]) + return 1; + else + return 2; +#endif +} + +template +__hostdev__ inline int MaxIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] > v[1] && v[0] > v[2]) + return 0; + if (v[1] > v[2]) + return 1; + else + return 2; +#endif +} + +/// @brief round up byteSize to the nearest wordSize, e.g. to align to machine word: AlignUp +__hostdev__ inline uint64_t AlignUp(uint64_t byteCount) +{ + const uint64_t r = byteCount % wordSize; + return r ? byteCount - r + wordSize : byteCount; +} + +// ------------------------------> Coord <-------------------------------------- + +// forward declaration so we can define Coord::asVec3s and Coord::asVec3d +template +class Vec3; + +/// @brief Signed (i, j, k) 32-bit integer coordinate class, similar to openvdb::math::Coord +class Coord +{ + int32_t mVec[3]; // private member data - three signed index coordinates +public: + using ValueType = int32_t; + using IndexType = uint32_t; + + /// @brief Initialize all coordinates to zero. + __hostdev__ Coord() + : mVec{0, 0, 0} + { + } + + /// @brief Initializes all coordinates to the given signed integer. + __hostdev__ explicit Coord(ValueType n) + : mVec{n, n, n} + { + } + + /// @brief Initializes coordinate to the given signed integers. + __hostdev__ Coord(ValueType i, ValueType j, ValueType k) + : mVec{i, j, k} + { + } + + __hostdev__ Coord(ValueType* ptr) + : mVec{ptr[0], ptr[1], ptr[2]} + { + } + + __hostdev__ int32_t x() const { return mVec[0]; } + __hostdev__ int32_t y() const { return mVec[1]; } + __hostdev__ int32_t z() const { return mVec[2]; } + + __hostdev__ int32_t& x() { return mVec[0]; } + __hostdev__ int32_t& y() { return mVec[1]; } + __hostdev__ int32_t& z() { return mVec[2]; } + + __hostdev__ static Coord max() { return Coord(int32_t((1u << 31) - 1)); } + + __hostdev__ static Coord min() { return Coord(-int32_t((1u << 31) - 1) - 1); } + + __hostdev__ static size_t memUsage() { return sizeof(Coord); } + + /// @brief Return a const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ const ValueType& operator[](IndexType i) const { return mVec[i]; } + + /// @brief Return a non-const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ ValueType& operator[](IndexType i) { return mVec[i]; } + + /// @brief Assignment operator that works with openvdb::Coord + template + __hostdev__ Coord& operator=(const CoordT& other) + { + static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); + mVec[0] = other[0]; + mVec[1] = other[1]; + mVec[2] = other[2]; + return *this; + } + + /// @brief Return a new instance with coordinates masked by the given unsigned integer. + __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } + + // @brief Return a new instance with coordinates left-shifted by the given unsigned integer. 
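+    // Editorial example (illustrative): masking with ~(dim - 1) maps a voxel
+    // coordinate to the origin of its enclosing node, e.g. for an 8^3 leaf:
+    //   Coord leafOrigin = ijk & ~(8u - 1);// equivalent to (ijk >> 3) << 3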
+ __hostdev__ Coord operator<<(IndexType n) const { return Coord(mVec[0] << n, mVec[1] << n, mVec[2] << n); } + + // @brief Return a new instance with coordinates right-shifted by the given unsigned integer. + __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } + + /// @brief Return true if this Coord is lexicographically less than the given Coord. + __hostdev__ bool operator<(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] < rhs[2] ? true : false; + } + + /// @brief Return true if this Coord is lexicographically less or equal to the given Coord. + __hostdev__ bool operator<=(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] <=rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater than the given Coord. + __hostdev__ bool operator>(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] > rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater or equal to the given Coord. + __hostdev__ bool operator>=(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] >=rhs[2] ? true : false; + } + + // @brief Return true if the Coord components are identical. + __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + __hostdev__ Coord& operator&=(int n) + { + mVec[0] &= n; + mVec[1] &= n; + mVec[2] &= n; + return *this; + } + __hostdev__ Coord& operator<<=(uint32_t n) + { + mVec[0] <<= n; + mVec[1] <<= n; + mVec[2] <<= n; + return *this; + } + __hostdev__ Coord& operator>>=(uint32_t n) + { + mVec[0] >>= n; + mVec[1] >>= n; + mVec[2] >>= n; + return *this; + } + __hostdev__ Coord& operator+=(int n) + { + mVec[0] += n; + mVec[1] += n; + mVec[2] += n; + return *this; + } + __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } + __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } + __hostdev__ Coord operator-() const { return Coord(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Coord& operator+=(const Coord& rhs) + { + mVec[0] += rhs[0]; + mVec[1] += rhs[1]; + mVec[2] += rhs[2]; + return *this; + } + __hostdev__ Coord& operator-=(const Coord& rhs) + { + mVec[0] -= rhs[0]; + mVec[1] -= rhs[1]; + mVec[2] -= rhs[2]; + return *this; + } + + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Coord& minComponent(const Coord& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. 
+ __hostdev__ Coord& maxComponent(const Coord& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline Coord& minComponentAtomic(const Coord& other) + { + atomicMin(&mVec[0], other[0]); + atomicMin(&mVec[1], other[1]); + atomicMin(&mVec[2], other[2]); + return *this; + } + __device__ inline Coord& maxComponentAtomic(const Coord& other) + { + atomicMax(&mVec[0], other[0]); + atomicMax(&mVec[1], other[1]); + atomicMax(&mVec[2], other[2]); + return *this; + } +#endif + + __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const + { + return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); + } + + __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } + + /// Return true if any of the components of @a a are smaller than the + /// corresponding components of @a b. + __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) + { + return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); + } + + /// @brief Return the largest integer coordinates that are not greater + /// than @a xyz (node centered conversion). + template + __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(math::Floor(xyz[0]), math::Floor(xyz[1]), math::Floor(xyz[2])); } + + /// @brief Return a hash key derived from the existing coordinates. + /// @details The hash function is originally taken from the SIGGRAPH paper: + /// "VDB: High-resolution sparse volumes with dynamic topology" + /// and the prime numbers are modified based on the ACM Transactions on Graphics paper: + /// "Real-time 3D reconstruction at scale using voxel hashing" (the second number had a typo!) 
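+    /// @par Example (editorial sketch):
+    /// @code
+    /// uint32_t bucket = ijk.hash<12>();// one of 2^12 = 4096 buckets
+    /// @endcode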
+ template + __hostdev__ uint32_t hash() const { return ((1 << Log2N) - 1) & (mVec[0] * 73856093 ^ mVec[1] * 19349669 ^ mVec[2] * 83492791); } + + /// @brief Return the octant of this Coord + //__hostdev__ size_t octant() const { return (uint32_t(mVec[0])>>31) | ((uint32_t(mVec[1])>>31)<<1) | ((uint32_t(mVec[2])>>31)<<2); } + __hostdev__ uint8_t octant() const { return (uint8_t(bool(mVec[0] & (1u << 31)))) | + (uint8_t(bool(mVec[1] & (1u << 31))) << 1) | + (uint8_t(bool(mVec[2] & (1u << 31))) << 2); } + + /// @brief Return a single precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3s() const; + + /// @brief Return a double precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3d() const; + + // returns a copy of itself, so it mimics the behaviour of Vec3::round() + __hostdev__ inline Coord round() const { return *this; } +}; // Coord class + +// ----------------------------> Vec3 <-------------------------------------- + +/// @brief A simple vector class with three components, similar to openvdb::math::Vec3 +template +class Vec3 +{ + T mVec[3]; + +public: + static const int SIZE = 3; + static const int size = 3; // in openvdb::math::Tuple + using ValueType = T; + Vec3() = default; + __hostdev__ explicit Vec3(T x) + : mVec{x, x, x} + { + } + __hostdev__ Vec3(T x, T y, T z) + : mVec{x, y, z} + { + } + template class Vec3T, class T2> + __hostdev__ Vec3(const Vec3T& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + } + template + __hostdev__ explicit Vec3(const Vec3& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + } + __hostdev__ explicit Vec3(const Coord& ijk) + : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} + { + } + __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + template class Vec3T, class T2> + __hostdev__ Vec3& operator=(const Vec3T& rhs) + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + mVec[0] = rhs[0]; + mVec[1] = rhs[1]; + mVec[2] = rhs[2]; + return *this; + } + __hostdev__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ T& operator[](int i) { return mVec[i]; } + template + __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } + template + __hostdev__ Vec3 cross(const Vec3T& v) const + { + return Vec3(mVec[1] * v[2] - mVec[2] * v[1], + mVec[2] * v[0] - mVec[0] * v[2], + mVec[0] * v[1] - mVec[1] * v[0]); + } + __hostdev__ T lengthSqr() const + { + return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2]; // 5 flops + } + __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } + __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } + __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } + __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } + __hostdev__ Vec3 operator+(const Coord& ijk) const { return Vec3(mVec[0] + ijk[0], mVec[1] + ijk[1], mVec[2] + ijk[2]); } + __hostdev__ Vec3 operator-(const Coord& 
ijk) const { return Vec3(mVec[0] - ijk[0], mVec[1] - ijk[1], mVec[2] - ijk[2]); } + __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } + __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec3& operator+=(const Vec3& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + return *this; + } + __hostdev__ Vec3& operator+=(const Coord& ijk) + { + mVec[0] += T(ijk[0]); + mVec[1] += T(ijk[1]); + mVec[2] += T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator-=(const Vec3& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + return *this; + } + __hostdev__ Vec3& operator-=(const Coord& ijk) + { + mVec[0] -= T(ijk[0]); + mVec[1] -= T(ijk[1]); + mVec[2] -= T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + return *this; + } + __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec3& minComponent(const Vec3& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec3& maxComponent(const Vec3& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } + /// @brief Return the smallest vector component + __hostdev__ ValueType min() const + { + return mVec[0] < mVec[1] ? (mVec[0] < mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] < mVec[2] ? mVec[1] : mVec[2]); + } + /// @brief Return the largest vector component + __hostdev__ ValueType max() const + { + return mVec[0] > mVec[1] ? (mVec[0] > mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] > mVec[2] ? 
mVec[1] : mVec[2]);
+    }
+    /// @brief Round each component of this Vec down to its integer value
+    /// @return Return an integer Coord
+    __hostdev__ Coord floor() const { return Coord(Floor(mVec[0]), Floor(mVec[1]), Floor(mVec[2])); }
+    /// @brief Round each component of this Vec up to its integer value
+    /// @return Return an integer Coord
+    __hostdev__ Coord ceil() const { return Coord(Ceil(mVec[0]), Ceil(mVec[1]), Ceil(mVec[2])); }
+    /// @brief Round each component of this Vec to its closest integer value
+    /// @return Return an integer Coord
+    __hostdev__ Coord round() const
+    {
+        if constexpr(util::is_same<T, float>::value) {
+            return Coord(Floor(mVec[0] + 0.5f), Floor(mVec[1] + 0.5f), Floor(mVec[2] + 0.5f));
+        } else if constexpr(util::is_same<T, int>::value) {
+            return Coord(mVec[0], mVec[1], mVec[2]);
+        } else {
+            return Coord(Floor(mVec[0] + 0.5), Floor(mVec[1] + 0.5), Floor(mVec[2] + 0.5));
+        }
+    }
+
+    /// @brief return a non-const raw pointer to the array of three vector components
+    __hostdev__ T* asPointer() { return mVec; }
+    /// @brief return a const raw pointer to the array of three vector components
+    __hostdev__ const T* asPointer() const { return mVec; }
+}; // Vec3
+
+template<typename T1, typename T2>
+__hostdev__ inline Vec3<T2> operator*(T1 scalar, const Vec3<T2>& vec)
+{
+    return Vec3<T2>(scalar * vec[0], scalar * vec[1], scalar * vec[2]);
+}
+template<typename T1, typename T2>
+__hostdev__ inline Vec3<T2> operator/(T1 scalar, const Vec3<T2>& vec)
+{
+    return Vec3<T2>(scalar / vec[0], scalar / vec[1], scalar / vec[2]);
+}
+
+/// @brief Return a single precision floating-point vector of this coordinate
+__hostdev__ inline Vec3<float> Coord::asVec3s() const
+{
+    return Vec3<float>(float(mVec[0]), float(mVec[1]), float(mVec[2]));
+}
+
+/// @brief Return a double precision floating-point vector of this coordinate
+__hostdev__ inline Vec3<double> Coord::asVec3d() const
+{
+    return Vec3<double>(double(mVec[0]), double(mVec[1]), double(mVec[2]));
+}
+
+// ----------------------------> Vec4 <--------------------------------------
+
+/// @brief A simple vector class with four components, similar to openvdb::math::Vec4
+template<typename T>
+class Vec4
+{
+    T mVec[4];
+
+public:
+    static const int SIZE = 4;
+    static const int size = 4;
+    using ValueType = T;
+    Vec4() = default;
+    __hostdev__ explicit Vec4(T x)
+        : mVec{x, x, x, x}
+    {
+    }
+    __hostdev__ Vec4(T x, T y, T z, T w)
+        : mVec{x, y, z, w}
+    {
+    }
+    template<typename T2>
+    __hostdev__ explicit Vec4(const Vec4<T2>& v)
+        : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])}
+    {
+    }
+    template<template<class> class Vec4T, class T2>
+    __hostdev__ Vec4(const Vec4T<T2>& v)
+        : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])}
+    {
+        static_assert(Vec4T<T2>::size == size, "expected Vec4T::size==4!");
+    }
+    __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; }
+    __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; }
+    template<template<class> class Vec4T, class T2>
+    __hostdev__ Vec4& operator=(const Vec4T<T2>& rhs)
+    {
+        static_assert(Vec4T<T2>::size == size, "expected Vec4T::size==4!");
+        mVec[0] = rhs[0];
+        mVec[1] = rhs[1];
+        mVec[2] = rhs[2];
+        mVec[3] = rhs[3];
+        return *this;
+    }
+
+    __hostdev__ const T& operator[](int i) const { return mVec[i]; }
+    __hostdev__ T& operator[](int i) { return mVec[i]; }
+    template<typename Vec4T>
+    __hostdev__ T dot(const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; }
+    __hostdev__ T lengthSqr() const
+    {
+        return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] *
mVec[3]; // 7 flops + } + __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } + __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } + __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } + __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } + __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } + __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } + __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec4& operator+=(const Vec4& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + mVec[3] += v[3]; + return *this; + } + __hostdev__ Vec4& operator-=(const Vec4& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + mVec[3] -= v[3]; + return *this; + } + __hostdev__ Vec4& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + mVec[3] *= s; + return *this; + } + __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec4& minComponent(const Vec4& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + if (other[3] < mVec[3]) + mVec[3] = other[3]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec4& maxComponent(const Vec4& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + if (other[3] > mVec[3]) + mVec[3] = other[3]; + return *this; + } +}; // Vec4 + +template +__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); +} +template +__hostdev__ inline Vec4 operator/(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); +} + +// ----------------------------> matMult <-------------------------------------- + +/// @brief Multiply a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. 
mat x xyz +template +__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. mat x xyz +template +__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. (mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], fmaf(static_cast(xyz[2]), mat[2], vec[0]))), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[5], vec[1]))), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. 
(mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. 
mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], fmaf(static_cast(xyz[2]), mat[6], vec[0]))), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[7], vec[1]))), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +// ----------------------------> BBox <------------------------------------- + +// Base-class for static polymorphism (cannot be constructed directly) +template +struct BaseBBox +{ + Vec3T mCoord[2]; + __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; + __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; + __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } + __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } + __hostdev__ Vec3T& min() { return mCoord[0]; } + __hostdev__ Vec3T& max() { return mCoord[1]; } + __hostdev__ const Vec3T& min() const { return mCoord[0]; } + __hostdev__ const Vec3T& max() const { return mCoord[1]; } + __hostdev__ BaseBBox& translate(const Vec3T& xyz) + { + mCoord[0] += xyz; + mCoord[1] += xyz; + return *this; + } + /// @brief Expand this bounding box to enclose point @c xyz. + __hostdev__ BaseBBox& expand(const Vec3T& xyz) + { + mCoord[0].minComponent(xyz); + mCoord[1].maxComponent(xyz); + return *this; + } + + /// @brief Expand this bounding box to enclose the given bounding box. + __hostdev__ BaseBBox& expand(const BaseBBox& bbox) + { + mCoord[0].minComponent(bbox[0]); + mCoord[1].maxComponent(bbox[1]); + return *this; + } + + /// @brief Intersect this bounding box with the given bounding box. 
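+    /// @par Example (editorial sketch, not part of the patch; uses the CoordBBox alias defined later in this file)
+    /// @code
+    /// nanovdb::CoordBBox a(nanovdb::Coord(0), nanovdb::Coord(10));
+    /// nanovdb::CoordBBox b(nanovdb::Coord(5), nanovdb::Coord(20));
+    /// a.intersect(b); // a now spans [(5,5,5), (10,10,10)]
+    /// @endcode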
+ __hostdev__ BaseBBox& intersect(const BaseBBox& bbox) + { + mCoord[0].maxComponent(bbox[0]); + mCoord[1].minComponent(bbox[1]); + return *this; + } + + //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const + //{ + // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); + //} + __hostdev__ bool isInside(const Vec3T& xyz) + { + if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) + return false; + if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) + return false; + return true; + } + +protected: + __hostdev__ BaseBBox() {} + __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) + : mCoord{min, max} + { + } +}; // BaseBBox + +template::value> +struct BBox; + +/// @brief Partial template specialization for floating point coordinate types. +/// +/// @note Min is inclusive and max is exclusive. If min = max the dimension of +/// the bounding box is zero and therefore it is also empty. +template +struct BBox : public BaseBBox +{ + using Vec3Type = Vec3T; + using ValueType = typename Vec3T::ValueType; + static_assert(util::is_floating_point::value, "Expected a floating point coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Default construction sets BBox to an empty bbox + __hostdev__ BBox() + : BaseT(Vec3T( Maximum::value()), + Vec3T(-Maximum::value())) + { + } + __hostdev__ BBox(const Vec3T& min, const Vec3T& max) + : BaseT(min, max) + { + } + __hostdev__ BBox(const Coord& min, const Coord& max) + : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), + Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) + { + } + __hostdev__ static BBox createCube(const Coord& min, typename Coord::ValueType dim) + { + return BBox(min, min.offsetBy(dim)); + } + + __hostdev__ BBox(const BaseBBox& bbox) + : BBox(bbox[0], bbox[1]) + { + } + __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || + mCoord[0][1] >= mCoord[1][1] || + mCoord[0][2] >= mCoord[1][2]; } + __hostdev__ operator bool() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + __hostdev__ Vec3T dim() const { return *this ? this->max() - this->min() : Vec3T(0); } + __hostdev__ bool isInside(const Vec3T& p) const + { + return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && + p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; + } + +}; // BBox + +/// @brief Partial template specialization for integer coordinate types +/// +/// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, +/// if min = max the bounding box contains exactly one point and dim = 1! +template +struct BBox : public BaseBBox +{ + static_assert(util::is_same::value, "Expected \"int\" coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Iterator over the domain covered by a BBox + /// @details z is the fastest-moving coordinate. 
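+    /// @par Example (editorial sketch, not part of the patch)
+    /// @code
+    /// nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(1));
+    /// for (auto it = bbox.begin(); it; ++it) {
+    ///     const nanovdb::Coord& ijk = *it; // visits (0,0,0), (0,0,1), (0,1,0), ... with z running fastest
+    /// }
+    /// @endcode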
+ class Iterator + { + const BBox& mBBox; + CoordT mPos; + + public: + __hostdev__ Iterator(const BBox& b) + : mBBox(b) + , mPos(b.min()) + { + } + __hostdev__ Iterator(const BBox& b, const Coord& p) + : mBBox(b) + , mPos(p) + { + } + __hostdev__ Iterator& operator++() + { + if (mPos[2] < mBBox[1][2]) { // this is the most common case + ++mPos[2];// increment z + } else if (mPos[1] < mBBox[1][1]) { + mPos[2] = mBBox[0][2];// reset z + ++mPos[1];// increment y + } else if (mPos[0] <= mBBox[1][0]) { + mPos[2] = mBBox[0][2];// reset z + mPos[1] = mBBox[0][1];// reset y + ++mPos[0];// increment x + } + return *this; + } + __hostdev__ Iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + __hostdev__ bool operator==(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos == rhs.mPos; + } + __hostdev__ bool operator!=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos != rhs.mPos; + } + __hostdev__ bool operator<(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos < rhs.mPos; + } + __hostdev__ bool operator<=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos <= rhs.mPos; + } + /// @brief Return @c true if the iterator still points to a valid coordinate. + __hostdev__ operator bool() const { return mPos <= mBBox[1]; } + __hostdev__ const CoordT& operator*() const { return mPos; } + }; // Iterator + __hostdev__ Iterator begin() const { return Iterator{*this}; } + __hostdev__ Iterator end() const { return Iterator{*this, CoordT(mCoord[1][0]+1, mCoord[0][1], mCoord[0][2])}; } + __hostdev__ BBox() + : BaseT(CoordT::max(), CoordT::min()) + { + } + __hostdev__ BBox(const CoordT& min, const CoordT& max) + : BaseT(min, max) + { + } + + template + __hostdev__ BBox(BBox& other, const SplitT&) + : BaseT(other.mCoord[0], other.mCoord[1]) + { + NANOVDB_ASSERT(this->is_divisible()); + const int n = MaxIndex(this->dim()); + mCoord[1][n] = (mCoord[0][n] + mCoord[1][n]) >> 1; + other.mCoord[0][n] = mCoord[1][n] + 1; + } + + __hostdev__ static BBox createCube(const CoordT& min, typename CoordT::ValueType dim) + { + return BBox(min, min.offsetBy(dim - 1)); + } + + __hostdev__ static BBox createCube(typename CoordT::ValueType min, typename CoordT::ValueType max) + { + return BBox(CoordT(min), CoordT(max)); + } + + __hostdev__ bool is_divisible() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + /// @brief Return true if this bounding box is empty, e.g. uninitialized + __hostdev__ bool empty() const { return mCoord[0][0] > mCoord[1][0] || + mCoord[0][1] > mCoord[1][1] || + mCoord[0][2] > mCoord[1][2]; } + /// @brief Convert this BBox to boolean true if it is not empty + __hostdev__ operator bool() const { return mCoord[0][0] <= mCoord[1][0] && + mCoord[0][1] <= mCoord[1][1] && + mCoord[0][2] <= mCoord[1][2]; } + __hostdev__ CoordT dim() const { return *this ? this->max() - this->min() + Coord(1) : Coord(0); } + __hostdev__ uint64_t volume() const + { + auto d = this->dim(); + return uint64_t(d[0]) * uint64_t(d[1]) * uint64_t(d[2]); + } + __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } + /// @brief Return @c true if the given bounding box is inside this bounding box. 
+ __hostdev__ bool isInside(const BBox& b) const + { + return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); + } + + /// @brief Return @c true if the given bounding box overlaps with this bounding box. + __hostdev__ bool hasOverlap(const BBox& b) const + { + return !(CoordT::lessThan(this->max(), b.min()) || CoordT::lessThan(b.max(), this->min())); + } + + /// @warning This converts a CoordBBox into a floating-point bounding box which implies that max += 1 ! + template + __hostdev__ BBox> asReal() const + { + static_assert(util::is_floating_point::value, "CoordBBox::asReal: Expected a floating point coordinate"); + return BBox>(Vec3(RealT(mCoord[0][0]), RealT(mCoord[0][1]), RealT(mCoord[0][2])), + Vec3(RealT(mCoord[1][0] + 1), RealT(mCoord[1][1] + 1), RealT(mCoord[1][2] + 1))); + } + /// @brief Return a new instance that is expanded by the specified padding. + __hostdev__ BBox expandBy(typename CoordT::ValueType padding) const + { + return BBox(mCoord[0].offsetBy(-padding), mCoord[1].offsetBy(padding)); + } + + /// @brief @brief transform this coordinate bounding box by the specified map + /// @param map mapping of index to world coordinates + /// @return world bounding box + template + __hostdev__ auto transform(const Map& map) const + { + using Vec3T = Vec3; + const Vec3T tmp = map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[0][2])); + BBox bbox(tmp, tmp);// return value + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[1][2]))); + return bbox; + } + +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline BBox& expandAtomic(const CoordT& ijk) + { + mCoord[0].minComponentAtomic(ijk); + mCoord[1].maxComponentAtomic(ijk); + return *this; + } + __device__ inline BBox& expandAtomic(const BBox& bbox) + { + mCoord[0].minComponentAtomic(bbox[0]); + mCoord[1].maxComponentAtomic(bbox[1]); + return *this; + } + __device__ inline BBox& intersectAtomic(const BBox& bbox) + { + mCoord[0].maxComponentAtomic(bbox[0]); + mCoord[1].minComponentAtomic(bbox[1]); + return *this; + } +#endif +}; // BBox + +// --------------------------> Rgba8 <------------------------------------ + +/// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int +class Rgba8 +{ + union + { + uint8_t c[4]; // 4 integer color channels of red, green, blue and alpha components. 
+ uint32_t packed; // 32 bit packed representation + } mData; + +public: + static const int SIZE = 4; + using ValueType = uint8_t; + + /// @brief Default copy constructor + Rgba8(const Rgba8&) = default; + + /// @brief Default move constructor + Rgba8(Rgba8&&) = default; + + /// @brief Default move assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(Rgba8&&) = default; + + /// @brief Default copy assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(const Rgba8&) = default; + + /// @brief Default ctor initializes all channels to zero + __hostdev__ Rgba8() + : mData{{0, 0, 0, 0}} + { + static_assert(sizeof(uint32_t) == sizeof(Rgba8), "Unexpected sizeof"); + } + + /// @brief integer r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0u to 255u + __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u) + : mData{{r, g, b, a}} + { + } + + /// @brief @brief ctor where all channels are initialized to the same value + /// @note value should be in the range 0u to 255u + explicit __hostdev__ Rgba8(uint8_t v) + : mData{{v, v, v, v}} + { + } + + /// @brief floating-point r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(float r, float g, float b, float a = 1.0f) + : mData{{static_cast(0.5f + r * 255.0f), // round floats to nearest integers + static_cast(0.5f + g * 255.0f), // double {{}} is needed due to union + static_cast(0.5f + b * 255.0f), + static_cast(0.5f + a * 255.0f)}} + { + } + + /// @brief Vec3f r,g,b ctor (alpha channel it set to 1) + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec3& rgb) + : Rgba8(rgb[0], rgb[1], rgb[2]) + { + } + + /// @brief Vec4f r,g,b,a ctor + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec4& rgba) + : Rgba8(rgba[0], rgba[1], rgba[2], rgba[3]) + { + } + + __hostdev__ bool operator< (const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } + __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } + __hostdev__ float lengthSqr() const + { + return 0.0000153787005f * (float(mData.c[0]) * mData.c[0] + + float(mData.c[1]) * mData.c[1] + + float(mData.c[2]) * mData.c[2]); //1/255^2 + } + __hostdev__ float length() const { return sqrtf(this->lengthSqr()); } + /// @brief return n'th color channel as a float in the range 0 to 1 + __hostdev__ float asFloat(int n) const { return 0.003921569f*float(mData.c[n]); }// divide by 255 + __hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; } + __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; } + __hostdev__ const uint32_t& packed() const { return mData.packed; } + __hostdev__ uint32_t& packed() { return mData.packed; } + __hostdev__ const uint8_t& r() const { return mData.c[0]; } + __hostdev__ const uint8_t& g() const { return mData.c[1]; } + __hostdev__ const uint8_t& b() const { return mData.c[2]; } + __hostdev__ const uint8_t& a() const { return mData.c[3]; } + __hostdev__ uint8_t& r() { return mData.c[0]; } + __hostdev__ uint8_t& g() { return mData.c[1]; } + __hostdev__ uint8_t& b() { return mData.c[2]; } + __hostdev__ uint8_t& a() { return mData.c[3]; } + __hostdev__ operator Vec3() const { + return Vec3(this->asFloat(0), this->asFloat(1), this->asFloat(2)); + } + __hostdev__ operator Vec4() const { + return Vec4(this->asFloat(0), this->asFloat(1), 
this->asFloat(2), this->asFloat(3)); + } +}; // Rgba8 + +using Vec3d = Vec3; +using Vec3f = Vec3; +using Vec3i = Vec3; +using Vec3u = Vec3; +using Vec3u8 = Vec3; +using Vec3u16 = Vec3; + +using Vec4R = Vec4; +using Vec4d = Vec4; +using Vec4f = Vec4; +using Vec4i = Vec4; + +}// namespace math =============================================================== + +using Rgba8 [[deprecated("Use math::Rgba8 instead.")]] = math::Rgba8; +using math::Coord; + +using Vec3d = math::Vec3; +using Vec3f = math::Vec3; +using Vec3i = math::Vec3; +using Vec3u = math::Vec3; +using Vec3u8 = math::Vec3; +using Vec3u16 = math::Vec3; + +using Vec4R = math::Vec4; +using Vec4d = math::Vec4; +using Vec4f = math::Vec4; +using Vec4i = math::Vec4; + +using CoordBBox = math::BBox; +using Vec3dBBox = math::BBox; +using BBoxR [[deprecated("Use Vec3dBBox instead.")]] = math::BBox; + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/math/Ray.h b/nanovdb/nanovdb/math/Ray.h new file mode 100644 index 0000000000..9f08288007 --- /dev/null +++ b/nanovdb/nanovdb/math/Ray.h @@ -0,0 +1,557 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/// @file Ray.h +/// +/// @author Ken Museth +/// +/// @brief A Ray class. + +#ifndef NANOVDB_MATH_RAY_H_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_RAY_H_HAS_BEEN_INCLUDED + +#include // for Vec3 +namespace nanovdb {// =================================================== + +namespace math {// ====================================================== + +template +class Ray +{ +public: + using RealType = RealT; + using Vec3Type = Vec3; + using Vec3T = Vec3Type; + + struct TimeSpan + { + RealT t0, t1; + /// @brief Default constructor + __hostdev__ TimeSpan() {} + /// @brief Constructor + __hostdev__ TimeSpan(RealT _t0, RealT _t1) + : t0(_t0) + , t1(_t1) + { + } + /// @brief Set both times + __hostdev__ void set(RealT _t0, RealT _t1) + { + t0 = _t0; + t1 = _t1; + } + /// @brief Get both times + __hostdev__ void get(RealT& _t0, RealT& _t1) const + { + _t0 = t0; + _t1 = t1; + } + /// @brief Return @c true if t1 is larger than t0 by at least eps. + __hostdev__ bool valid(RealT eps = Delta::value()) const { return (t1 - t0) > eps; } + /// @brief Return the midpoint of the ray. 
+ __hostdev__ RealT mid() const { return 0.5 * (t0 + t1); } + /// @brief Multiplies both times + __hostdev__ void scale(RealT s) + { + assert(s > 0); + t0 *= s; + t1 *= s; + } + /// @brief Return @c true if time is inclusive + __hostdev__ bool test(RealT t) const { return (t >= t0 && t <= t1); } + }; + + __hostdev__ Ray(const Vec3Type& eye = Vec3Type(0, 0, 0), + const Vec3Type& direction = Vec3Type(1, 0, 0), + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + : mEye(eye) + , mDir(direction) + , mInvDir(1 / mDir[0], 1 / mDir[1], 1 / mDir[2]) + , mTimeSpan(t0, t1) + , mSign{mInvDir[0] < 0, mInvDir[1] < 0, mInvDir[2] < 0} + { + } + + __hostdev__ Ray& offsetEye(RealT offset) + { + mEye[0] += offset; + mEye[1] += offset; + mEye[2] += offset; + return *this; + } + + __hostdev__ Ray& setEye(const Vec3Type& eye) + { + mEye = eye; + return *this; + } + + __hostdev__ Ray& setDir(const Vec3Type& dir) + { + mDir = dir; + mInvDir[0] = 1.0 / mDir[0]; + mInvDir[1] = 1.0 / mDir[1]; + mInvDir[2] = 1.0 / mDir[2]; + mSign[0] = mInvDir[0] < 0; + mSign[1] = mInvDir[1] < 0; + mSign[2] = mInvDir[2] < 0; + return *this; + } + + __hostdev__ Ray& setMinTime(RealT t0) + { + mTimeSpan.t0 = t0; + return *this; + } + + __hostdev__ Ray& setMaxTime(RealT t1) + { + mTimeSpan.t1 = t1; + return *this; + } + + __hostdev__ Ray& setTimes( + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + { + assert(t0 > 0 && t1 > 0); + mTimeSpan.set(t0, t1); + return *this; + } + + __hostdev__ Ray& scaleTimes(RealT scale) + { + mTimeSpan.scale(scale); + return *this; + } + + __hostdev__ Ray& reset( + const Vec3Type& eye, + const Vec3Type& direction, + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + { + this->setEye(eye); + this->setDir(direction); + this->setTimes(t0, t1); + return *this; + } + + __hostdev__ const Vec3T& eye() const { return mEye; } + + __hostdev__ const Vec3T& dir() const { return mDir; } + + __hostdev__ const Vec3T& invDir() const { return mInvDir; } + + __hostdev__ RealT t0() const { return mTimeSpan.t0; } + + __hostdev__ RealT t1() const { return mTimeSpan.t1; } + + __hostdev__ int sign(int i) const { return mSign[i]; } + + /// @brief Return the position along the ray at the specified time. + __hostdev__ Vec3T operator()(RealT time) const + { +#if 1 + return Vec3T(fmaf(time, mDir[0], mEye[0]), + fmaf(time, mDir[1], mEye[1]), + fmaf(time, mDir[2], mEye[2])); +#else + return mEye + mDir * time; +#endif + } + + /// @brief Return the starting point of the ray. + __hostdev__ Vec3T start() const { return (*this)(mTimeSpan.t0); } + + /// @brief Return the endpoint of the ray. + __hostdev__ Vec3T end() const { return (*this)(mTimeSpan.t1); } + + /// @brief Return the midpoint of the ray. + __hostdev__ Vec3T mid() const { return (*this)(mTimeSpan.mid()); } + + /// @brief Return @c true if t1 is larger than t0 by at least eps. + __hostdev__ bool valid(RealT eps = Delta::value()) const { return mTimeSpan.valid(eps); } + + /// @brief Return @c true if @a time is within t0 and t1, both inclusive. + __hostdev__ bool test(RealT time) const { return mTimeSpan.test(time); } + + /// @brief Return a new Ray that is transformed with the specified map. + /// + /// @param map the map from which to construct the new Ray. + /// + /// @warning Assumes a linear map and a normalized direction. + /// + /// @details The requirement that the direction is normalized + /// follows from the transformation of t0 and t1 - and that fact that + /// we want applyMap and applyInverseMap to be inverse operations. 
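+    /// @par Example (editorial sketch, not part of the patch; assumes a nanovdb::Map or a similar type exposing applyMap/applyJacobian)
+    /// @code
+    /// auto indexRay = worldRay.applyInverseMap(map); // world -> index space
+    /// auto worldRay2 = indexRay.applyMap(map);       // index -> world, recovers worldRay
+    /// @endcode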
+ template + __hostdev__ Ray applyMap(const MapType& map) const + { + const Vec3T eye = map.applyMap(mEye); + const Vec3T dir = map.applyJacobian(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + template + __hostdev__ Ray applyMapF(const MapType& map) const + { + const Vec3T eye = map.applyMapF(mEye); + const Vec3T dir = map.applyJacobianF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return a new Ray that is transformed with the inverse of the specified map. + /// + /// @param map the map from which to construct the new Ray by inverse mapping. + /// + /// @warning Assumes a linear map and a normalized direction. + /// + /// @details The requirement that the direction is normalized + /// follows from the transformation of t0 and t1 - and that fact that + /// we want applyMap and applyInverseMap to be inverse operations. + template + __hostdev__ Ray applyInverseMap(const MapType& map) const + { + const Vec3T eye = map.applyInverseMap(mEye); + const Vec3T dir = map.applyInverseJacobian(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); + } + template + __hostdev__ Ray applyInverseMapF(const MapType& map) const + { + const Vec3T eye = map.applyInverseMapF(mEye); + const Vec3T dir = map.applyInverseJacobianF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); + } + + /// @brief Return a new ray in world space, assuming the existing + /// ray is represented in the index space of the specified grid. + template + __hostdev__ Ray indexToWorldF(const GridType& grid) const + { + const Vec3T eye = grid.indexToWorldF(mEye); + const Vec3T dir = grid.indexToWorldDirF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return a new ray in index space, assuming the existing + /// ray is represented in the world space of the specified grid. + template + __hostdev__ Ray worldToIndexF(const GridType& grid) const + { + const Vec3T eye = grid.worldToIndexF(mEye); + const Vec3T dir = grid.worldToIndexDirF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + /// @param t0 The first intersection point if an intersection exists. + /// @param t1 The second intersection point if an intersection exists. + /// + /// @note If the return value is true, i.e. a hit, and t0 = + /// this->t0() or t1 == this->t1() only one true intersection exist. 
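+    /// @par Example (editorial sketch, not part of the patch; @c eye, @c dir, @c center and @c radius are placeholders)
+    /// @code
+    /// nanovdb::math::Ray<float> ray(eye, dir); // dir assumed normalized
+    /// float t0, t1;
+    /// if (ray.intersects(center, radius, t0, t1)) {
+    ///     const nanovdb::Vec3f p = ray(t0); // position of the first hit
+    /// }
+    /// @endcode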
+ __hostdev__ bool intersects(const Vec3T& center, RealT radius, RealT& t0, RealT& t1) const + { + const Vec3T origin = mEye - center; + const RealT A = mDir.lengthSqr(); + const RealT B = 2 * mDir.dot(origin); + const RealT C = origin.lengthSqr() - radius * radius; + const RealT D = B * B - 4 * A * C; + + if (D < 0) { + return false; + } + const RealT Q = RealT(-0.5) * (B < 0 ? (B + Sqrt(D)) : (B - Sqrt(D))); + + t0 = Q / A; + t1 = C / Q; + + if (t0 > t1) { + RealT tmp = t0; + t0 = t1; + t1 = tmp; + } + if (t0 < mTimeSpan.t0) { + t0 = mTimeSpan.t0; + } + if (t1 > mTimeSpan.t1) { + t1 = mTimeSpan.t1; + } + return t0 <= t1; + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + __hostdev__ bool intersects(const Vec3T& center, RealT radius) const + { + RealT t0, t1; + return this->intersects(center, radius, t0, t1) > 0; + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @note For intersection this ray is clipped to the two intersection points. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + __hostdev__ bool clip(const Vec3T& center, RealT radius) + { + RealT t0, t1; + const bool hit = this->intersects(center, radius, t0, t1); + if (hit) { + mTimeSpan.set(t0, t1); + } + return hit; + } +#if 0 + /// @brief Return true if the Ray intersects the specified + /// axisaligned bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as the Ray. + /// @param t0 If an intersection is detected this is assigned + /// the time for the first intersection point. + /// @param t1 If an intersection is detected this is assigned + /// the time for the second intersection point. + template + __hostdev__ bool intersects(const BBoxT& bbox, RealT& t0, RealT& t1) const + { + t0 = (bbox[ mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t2 = (bbox[1-mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t0 > t2) return false; + t1 = (bbox[1-mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t3 = (bbox[ mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t3 > t1) return false; + if (t3 > t0) t0 = t3; + if (t2 < t1) t1 = t2; + t3 = (bbox[ mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t3 > t1) return false; + t2 = (bbox[1-mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t0 > t2) return false; + if (t3 > t0) t0 = t3; + if (mTimeSpan.t1 < t0) return false; + if (t2 < t1) t1 = t2; + if (mTimeSpan.t0 > t1) return false; + if (mTimeSpan.t0 > t0) t0 = mTimeSpan.t0; + if (mTimeSpan.t1 < t1) t1 = mTimeSpan.t1; + return true; + /* + mTimeSpan.get(_t0, _t1); + double t0 = _t0, t1 = _t1; + for (int i = 0; i < 3; ++i) { + //if (abs(mDir[i])<1e-3) continue; + double a = (double(bbox.min()[i]) - mEye[i]) * mInvDir[i]; + double b = (double(bbox.max()[i]) - mEye[i]) * mInvDir[i]; + if (a > b) { + double tmp = a; + a = b; + b = tmp; + } + if (a > t0) t0 = a; + if (b < t1) t1 = b; + if (t0 > t1) { + //if (gVerbose) printf("Missed BBOX: (%i,%i,%i) -> (%i,%i,%i) t0=%f t1=%f\n", + // bbox.min()[0], bbox.min()[1], bbox.min()[2], + // bbox.max()[0], bbox.max()[1], bbox.max()[2], t0, t1); + return false; + } + } + _t0 = t0; _t1 = t1; + return true; + */ + } +#else + /// @brief Returns true if this ray intersects an index bounding box. 
+ /// If the return value is true t0 and t1 are set to the intersection + /// times along the ray. + /// + /// @warning Intersection with a CoordBBox internally converts to a floating-point bbox + /// which imples that the max is padded with one voxel, i.e. bbox.max += 1! This + /// avoids gaps between neighboring CoordBBox'es, say from neighboring tree nodes. + __hostdev__ bool intersects(const CoordBBox& bbox, RealT& t0, RealT& t1) const + { + mTimeSpan.get(t0, t1); + for (int i = 0; i < 3; ++i) { + RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i] + 1); + if (a >= b) { // empty bounding box + return false; + } + a = (a - mEye[i]) * mInvDir[i]; + b = (b - mEye[i]) * mInvDir[i]; + if (a > b) { + RealT tmp = a; + a = b; + b = tmp; + } + if (a > t0) { + t0 = a; + } + if (b < t1) { + t1 = b; + } + if (t0 > t1) { + return false; + } + } + return true; + } + /// @brief Returns true if this ray intersects a floating-point bounding box. + /// If the return value is true t0 and t1 are set to the intersection + /// times along the ray. + template + __hostdev__ bool intersects(const BBox& bbox, RealT& t0, RealT& t1) const + { + static_assert(util::is_floating_point::value, "Ray::intersects: Expected a floating point coordinate"); + mTimeSpan.get(t0, t1); + for (int i = 0; i < 3; ++i) { + RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i]); + if (a >= b) { // empty bounding box + return false; + } + a = (a - mEye[i]) * mInvDir[i]; + b = (b - mEye[i]) * mInvDir[i]; + if (a > b) { + RealT tmp = a; + a = b; + b = tmp; + } + if (a > t0) { + t0 = a; + } + if (b < t1) { + t1 = b; + } + if (t0 > t1) { + return false; + } + } + return true; + } +#endif + + /// @brief Return true if this ray intersects the specified bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as this ray. + /// + /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point + /// bounding box, which imples that the max is padded with one voxel, i.e. + /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say + /// from neighboring tree nodes. + template + __hostdev__ bool intersects(const BBoxT& bbox) const + { +#if 1 + RealT t0, t1; + return this->intersects(bbox, t0, t1); +#else + //BBox bbox(Vec3T(_bbox[0][0]-1e-4,_bbox[0][1]-1e-4,_bbox[0][2]-1e-4), + // Vec3T(_bbox[1][0]+1e-4,_bbox[1][1]+1e-4,_bbox[1][2]+1e-4)); + RealT t0 = (bbox[mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t2 = (bbox[1 - mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t0 > t2) return false; + RealT t1 = (bbox[1 - mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t3 = (bbox[mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t3 > t1) return false; + if (t3 > t0) t0 = t3; + if (t2 < t1) t1 = t2; + t3 = (bbox[mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t3 > t1) return false; + t2 = (bbox[1 - mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t0 > t2) return false; + //if (t3 > t0) t0 = t3; + //if (mTimeSpan.t1 < t0) return false; + //if (t2 < t1) t1 = t2; + //return mTimeSpan.t0 < t1; + return true; +#endif + } + + /// @brief Return true if this ray intersects the specified bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as this ray. + /// + /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point + /// bounding box, which imples that the max is padded with one voxel, i.e. + /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say + /// from neighboring tree nodes. 
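+    /// @par Example (editorial sketch, not part of the patch; assumes a NanoVDB grid exposing indexBBox())
+    /// @code
+    /// auto iRay = wRay.worldToIndexF(grid);
+    /// if (iRay.clip(grid.indexBBox())) {
+    ///     // march iRay from iRay.t0() to iRay.t1()
+    /// }
+    /// @endcode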
+ /// + /// @note For intersection this ray is clipped to the two intersection points. + template + __hostdev__ bool clip(const BBoxT& bbox) + { + RealT t0, t1; + const bool hit = this->intersects(bbox, t0, t1); + if (hit) { + mTimeSpan.set(t0, t1); + } + return hit; + } + + /// @brief Return true if the Ray intersects the plane specified + /// by a normal and distance from the origin. + /// + /// @param normal Normal of the plane. + /// @param distance Distance of the plane to the origin. + /// @param t Time of intersection, if one exists. + __hostdev__ bool intersects(const Vec3T& normal, RealT distance, RealT& t) const + { + const RealT cosAngle = mDir.dot(normal); + if (isApproxZero(cosAngle)) { + return false; // ray is parallel to plane + } + t = (distance - mEye.dot(normal)) / cosAngle; + return this->test(t); + } + + /// @brief Return true if the Ray intersects the plane specified + /// by a normal and point. + /// + /// @param normal Normal of the plane. + /// @param point Point in the plane. + /// @param t Time of intersection, if one exists. + __hostdev__ bool intersects(const Vec3T& normal, const Vec3T& point, RealT& t) const + { + return this->intersects(normal, point.dot(normal), t); + } + +private: + Vec3T mEye, mDir, mInvDir; + TimeSpan mTimeSpan; + int mSign[3]; +}; // end of Ray class + +} // namespace math ========================================================= + +template +using Ray [[deprecated("Use nanovdb::math::Ray instead")]] = math::Ray; + +} // namespace nanovdb ======================================================= + +#endif // NANOVDB_MATH_RAY_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/math/SampleFromVoxels.h b/nanovdb/nanovdb/math/SampleFromVoxels.h new file mode 100644 index 0000000000..e4f1e26018 --- /dev/null +++ b/nanovdb/nanovdb/math/SampleFromVoxels.h @@ -0,0 +1,996 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +////////////////////////////////////////////////////////////////////////// +/// +/// @file SampleFromVoxels.h +/// +/// @brief NearestNeighborSampler, TrilinearSampler, TriquadraticSampler and TricubicSampler +/// +/// @note These interpolators employ internal caching for better performance when used repeatedly +/// in the same voxel location, so try to reuse an instance of these classes more than once. +/// +/// @warning While all the interpolators defined below work with both scalars and vectors +/// values (e.g. float and Vec3) TrilinarSampler::zeroCrossing and +/// Trilinear::gradient will only compile with floating point value types. 
+/// +/// @author Ken Museth +/// +/////////////////////////////////////////////////////////////////////////// + +#ifndef NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED +#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED + +// Only define __hostdev__ when compiling as NVIDIA CUDA +#if defined(__CUDACC__) || defined(__HIP__) +#define __hostdev__ __host__ __device__ +#else +#include // for floor +#define __hostdev__ +#endif + +#include + +namespace nanovdb { + +namespace math { + +// Forward declaration of sampler with specific polynomial orders +template +class SampleFromVoxels; + +/// @brief Factory free-function for a sampler of specific polynomial orders +/// +/// @details This allows for the compact syntax: +/// @code +/// auto acc = grid.getAccessor(); +/// auto smp = nanovdb::math::createSampler<1>( acc ); +/// @endcode +template +__hostdev__ SampleFromVoxels createSampler(const TreeOrAccT& acc) +{ + return SampleFromVoxels(acc); +} + +/// @brief Utility function that returns the Coord of the round-down of @a xyz +/// and redefined @xyz as the fractional part, ie xyz-in = return-value + xyz-out +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz); + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz) +{ + const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz) +{ + const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +// ------------------------------> NearestNeighborSampler <-------------------------------------- + +/// @brief Nearest neighbor, i.e. zero order, interpolator with caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + static const int ORDER = 0; + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : mAcc(acc) + , mPos(CoordT::max()) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT& ijk) const; + +private: + const TreeOrAccT& mAcc; + mutable CoordT mPos; + mutable ValueT mVal; // private cache +}; // SampleFromVoxels + +/// @brief Nearest neighbor, i.e. 
zero order, interpolator without caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 0; + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT& ijk) const { return mAcc.getValue(ijk);} + +private: + const TreeOrAccT& mAcc; +}; // SampleFromVoxels + +template +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const +{ + const CoordT ijk = math::Round(xyz); + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const +{ + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} + +template +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const +{ + return mAcc.getValue(math::Round(xyz)); +} + +// ------------------------------> TrilinearSampler <-------------------------------------- + +/// @brief Tri-linear sampler, i.e. first order, interpolator +template +class TrilinearSampler +{ +protected: + const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TrilinearSampler(const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]); + + template class Vec3T> + static inline __hostdev__ Vec3T gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]); + + static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[2][2][2]); +}; // TrilinearSamplerBase + +template +__hostdev__ void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const +{ + v[0][0][0] = mAcc.getValue(ijk); // i, j, k + + ijk[2] += 1; + v[0][0][1] = mAcc.getValue(ijk); // i, j, k + 1 + + ijk[1] += 1; + v[0][1][1] = mAcc.getValue(ijk); // i, j+1, k + 1 + + ijk[2] -= 1; + v[0][1][0] = mAcc.getValue(ijk); // i, j+1, k + + ijk[0] += 1; + ijk[1] -= 1; + v[1][0][0] = mAcc.getValue(ijk); // i+1, j, k + + ijk[2] += 1; + v[1][0][1] = mAcc.getValue(ijk); // i+1, j, k + 1 + + ijk[1] += 1; + v[1][1][1] = mAcc.getValue(ijk); // i+1, j+1, k + 1 + + ijk[2] -= 1; + v[1][1][0] = mAcc.getValue(ijk); // i+1, j+1, k +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]) +{ +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; +#endif + return lerp(lerp(lerp(v[0][0][0], v[0][0][1], uvw[2]), lerp(v[0][1][0], v[0][1][1], uvw[2]), uvw[1]), + 
lerp(lerp(v[1][0][0], v[1][0][1], uvw[2]), lerp(v[1][1][0], v[1][1][1], uvw[2]), uvw[1]), + uvw[0]); +} + +template +template class Vec3T> +__hostdev__ Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type"); +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; +#endif + + ValueT D[4] = {v[0][0][1] - v[0][0][0], v[0][1][1] - v[0][1][0], v[1][0][1] - v[1][0][0], v[1][1][1] - v[1][1][0]}; + + // Z component + Vec3T grad(0, 0, lerp(lerp(D[0], D[1], uvw[1]), lerp(D[2], D[3], uvw[1]), uvw[0])); + + const ValueT w = ValueT(uvw[2]); + D[0] = v[0][0][0] + D[0] * w; + D[1] = v[0][1][0] + D[1] * w; + D[2] = v[1][0][0] + D[2] * w; + D[3] = v[1][1][0] + D[3] * w; + + // X component + grad[0] = lerp(D[2], D[3], uvw[1]) - lerp(D[0], D[1], uvw[1]); + + // Y component + grad[1] = lerp(D[1] - D[0], D[3] - D[2], uvw[0]); + + return grad; +} + +template +__hostdev__ bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + return (less ^ (v[0][0][1] < ValueT(0))) || + (less ^ (v[0][1][1] < ValueT(0))) || + (less ^ (v[0][1][0] < ValueT(0))) || + (less ^ (v[1][0][0] < ValueT(0))) || + (less ^ (v[1][0][1] < ValueT(0))) || + (less ^ (v[1][1][1] < ValueT(0))) || + (less ^ (v[1][1][0] < ValueT(0))); +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels : public TrilinearSampler +{ + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + /// @note ijk is in index space space + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
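+    /// @par Example (editorial sketch, not part of the patch; assumes a float SDF grid and an index-space position @c iPos)
+    /// @code
+    /// auto smp = nanovdb::math::createSampler<1>(grid.tree());
+    /// if (smp.zeroCrossing(iPos)) {
+    ///     // the zero level set passes through the enclosing 8-voxel stencil
+    /// }
+    /// @endcode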
+ /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels : public TrilinearSampler +{ + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[2][2][2]; + + template class Vec3T> + __hostdev__ void cache(Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + // @note ijk is in index space space + __hostdev__ ValueT operator()(const CoordT &ijk) const; + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. + /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const +{ + return ijk == mPos ? 
mVal[0][0][0] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +__hostdev__ Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::gradient(xyz, mVal); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +#if 0 + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::sample(xyz, val); +} + +#else + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; + + CoordT coord = Floor(xyz); + + ValueT vx, vx1, vy, vy1, vz, vz1; + + vz = BaseT::mAcc.getValue(coord); + coord[2] += 1; + vz1 = BaseT::mAcc.getValue(coord); + vy = lerp(vz, vz1, xyz[2]); + + coord[1] += 1; + + vz1 = BaseT::mAcc.getValue(coord); + coord[2] -= 1; + vz = BaseT::mAcc.getValue(coord); + vy1 = lerp(vz, vz1, xyz[2]); + + vx = lerp(vy, vy1, xyz[1]); + + coord[0] += 1; + + vz = BaseT::mAcc.getValue(coord); + coord[2] += 1; + vz1 = BaseT::mAcc.getValue(coord); + vy1 = lerp(vz, vz1, xyz[2]); + + coord[1] -= 1; + + vz1 = BaseT::mAcc.getValue(coord); + coord[2] -= 1; + vz = BaseT::mAcc.getValue(coord); + vy = lerp(vz, vz1, xyz[2]); + + vx1 = lerp(vy, vy1, xyz[1]); + + return lerp(vx, vx1, xyz[0]); +} +#endif + + +template +template class Vec3T> +__hostdev__ inline Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::gradient(xyz, val); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TriquadraticSampler <-------------------------------------- + +/// @brief Tri-quadratic sampler, i.e. 
second order, interpolator +template +class TriquadraticSampler +{ +protected: + const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TriquadraticSampler(const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 27 values + inline __hostdev__ void stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]); + + static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[3][3][3]); +}; // TriquadraticSamplerBase + +template +__hostdev__ void TriquadraticSampler::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const +{ + CoordT p(ijk[0] - 1, 0, 0); + for (int dx = 0; dx < 3; ++dx, ++p[0]) { + p[1] = ijk[1] - 1; + for (int dy = 0; dy < 3; ++dy, ++p[1]) { + p[2] = ijk[2] - 1; + for (int dz = 0; dz < 3; ++dz, ++p[2]) { + v[dx][dy][dz] = mAcc.getValue(p);// extract the stencil of 27 values + } + } + } +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler::sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]) +{ + auto kernel = [](const ValueT* value, double weight)->ValueT { + return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) + + 0.5f * (value[2] - value[0])) + value[1]; + }; + + ValueT vx[3]; + for (int dx = 0; dx < 3; ++dx) { + ValueT vy[3]; + for (int dy = 0; dy < 3; ++dy) { + vy[dy] = kernel(&v[dx][dy][0], uvw[2]); + }//loop over y + vx[dx] = kernel(vy, uvw[1]); + }//loop over x + return kernel(vx, uvw[0]); +} + +template +__hostdev__ bool TriquadraticSampler::zeroCrossing(const ValueT (&v)[3][3][3]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + for (int dx = 0; dx < 3; ++dx) { + for (int dy = 0; dy < 3; ++dy) { + for (int dz = 0; dz < 3; ++dz) { + if (less ^ (v[dx][dy][dz] < ValueT(0))) return true; + } + } + } + return false; +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels : public TriquadraticSampler +{ + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
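+    /// @par Example (editorial sketch, not part of the patch; @c iPos is a placeholder index-space position)
+    /// @code
+    /// auto acc = grid.getAccessor();
+    /// auto smp = nanovdb::math::createSampler<2>(acc); // tri-quadratic
+    /// const bool hit = smp.zeroCrossing(iPos); // tests the 27-point stencil around iPos
+    /// @endcode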
+ /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels : public TriquadraticSampler +{ + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[3][3][3]; + + template class Vec3T> + __hostdev__ void cache(Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT &ijk) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. + /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const +{ + return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::sample(xyz, val); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TricubicSampler <-------------------------------------- + +/// @brief Tri-cubic sampler, i.e. third order, interpolator. +/// +/// @details See the following paper for implementation details: +/// Lekien, F. and Marsden, J.: Tricubic interpolation in three dimensions. +/// In: International Journal for Numerical Methods +/// in Engineering (2005), No. 63, p. 
455-471 + +template +class TricubicSampler +{ +protected: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + const TreeOrAccT& mAcc; + +public: + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ TricubicSampler(const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(const CoordT& ijk, ValueT (&c)[64]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&c)[64]); +}; // TricubicSampler + +template +__hostdev__ void TricubicSampler::stencil(const CoordT& ijk, ValueT (&C)[64]) const +{ + auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; }; + + // fetch 64 point stencil values + for (int i = -1; i < 3; ++i) { + for (int j = -1; j < 3; ++j) { + fetch(i, j, -1) = mAcc.getValue(ijk + CoordT(i, j, -1)); + fetch(i, j, 0) = mAcc.getValue(ijk + CoordT(i, j, 0)); + fetch(i, j, 1) = mAcc.getValue(ijk + CoordT(i, j, 1)); + fetch(i, j, 2) = mAcc.getValue(ijk + CoordT(i, j, 2)); + } + } + const ValueT half(0.5), quarter(0.25), eighth(0.125); + const ValueT X[64] = {// values of f(x,y,z) at the 8 corners (each from 1 stencil value). + fetch(0, 0, 0), + fetch(1, 0, 0), + fetch(0, 1, 0), + fetch(1, 1, 0), + fetch(0, 0, 1), + fetch(1, 0, 1), + fetch(0, 1, 1), + fetch(1, 1, 1), + // values of df/dx at the 8 corners (each from 2 stencil values). + half * (fetch(1, 0, 0) - fetch(-1, 0, 0)), + half * (fetch(2, 0, 0) - fetch(0, 0, 0)), + half * (fetch(1, 1, 0) - fetch(-1, 1, 0)), + half * (fetch(2, 1, 0) - fetch(0, 1, 0)), + half * (fetch(1, 0, 1) - fetch(-1, 0, 1)), + half * (fetch(2, 0, 1) - fetch(0, 0, 1)), + half * (fetch(1, 1, 1) - fetch(-1, 1, 1)), + half * (fetch(2, 1, 1) - fetch(0, 1, 1)), + // values of df/dy at the 8 corners (each from 2 stencil values). + half * (fetch(0, 1, 0) - fetch(0, -1, 0)), + half * (fetch(1, 1, 0) - fetch(1, -1, 0)), + half * (fetch(0, 2, 0) - fetch(0, 0, 0)), + half * (fetch(1, 2, 0) - fetch(1, 0, 0)), + half * (fetch(0, 1, 1) - fetch(0, -1, 1)), + half * (fetch(1, 1, 1) - fetch(1, -1, 1)), + half * (fetch(0, 2, 1) - fetch(0, 0, 1)), + half * (fetch(1, 2, 1) - fetch(1, 0, 1)), + // values of df/dz at the 8 corners (each from 2 stencil values). + half * (fetch(0, 0, 1) - fetch(0, 0, -1)), + half * (fetch(1, 0, 1) - fetch(1, 0, -1)), + half * (fetch(0, 1, 1) - fetch(0, 1, -1)), + half * (fetch(1, 1, 1) - fetch(1, 1, -1)), + half * (fetch(0, 0, 2) - fetch(0, 0, 0)), + half * (fetch(1, 0, 2) - fetch(1, 0, 0)), + half * (fetch(0, 1, 2) - fetch(0, 1, 0)), + half * (fetch(1, 1, 2) - fetch(1, 1, 0)), + // values of d2f/dxdy at the 8 corners (each from 4 stencil values). 
+ quarter * (fetch(1, 1, 0) - fetch(-1, 1, 0) - fetch(1, -1, 0) + fetch(-1, -1, 0)), + quarter * (fetch(2, 1, 0) - fetch(0, 1, 0) - fetch(2, -1, 0) + fetch(0, -1, 0)), + quarter * (fetch(1, 2, 0) - fetch(-1, 2, 0) - fetch(1, 0, 0) + fetch(-1, 0, 0)), + quarter * (fetch(2, 2, 0) - fetch(0, 2, 0) - fetch(2, 0, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1)), + quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1)), + quarter * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1)), + quarter * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1)), + // values of d2f/dxdz at the 8 corners (each from 4 stencil values). + quarter * (fetch(1, 0, 1) - fetch(-1, 0, 1) - fetch(1, 0, -1) + fetch(-1, 0, -1)), + quarter * (fetch(2, 0, 1) - fetch(0, 0, 1) - fetch(2, 0, -1) + fetch(0, 0, -1)), + quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1)), + quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1)), + quarter * (fetch(1, 0, 2) - fetch(-1, 0, 2) - fetch(1, 0, 0) + fetch(-1, 0, 0)), + quarter * (fetch(2, 0, 2) - fetch(0, 0, 2) - fetch(2, 0, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0)), + quarter * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0)), + // values of d2f/dydz at the 8 corners (each from 4 stencil values). + quarter * (fetch(0, 1, 1) - fetch(0, -1, 1) - fetch(0, 1, -1) + fetch(0, -1, -1)), + quarter * (fetch(1, 1, 1) - fetch(1, -1, 1) - fetch(1, 1, -1) + fetch(1, -1, -1)), + quarter * (fetch(0, 2, 1) - fetch(0, 0, 1) - fetch(0, 2, -1) + fetch(0, 0, -1)), + quarter * (fetch(1, 2, 1) - fetch(1, 0, 1) - fetch(1, 2, -1) + fetch(1, 0, -1)), + quarter * (fetch(0, 1, 2) - fetch(0, -1, 2) - fetch(0, 1, 0) + fetch(0, -1, 0)), + quarter * (fetch(1, 1, 2) - fetch(1, -1, 2) - fetch(1, 1, 0) + fetch(1, -1, 0)), + quarter * (fetch(0, 2, 2) - fetch(0, 0, 2) - fetch(0, 2, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 2, 2) - fetch(1, 0, 2) - fetch(1, 2, 0) + fetch(1, 0, 0)), + // values of d3f/dxdydz at the 8 corners (each from 8 stencil values). 
+ eighth * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1) + fetch(1, -1, -1) - fetch(-1, -1, -1)), + eighth * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1) + fetch(2, -1, -1) - fetch(0, -1, -1)), + eighth * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1) - fetch(1, 2, -1) + fetch(-1, 2, -1) + fetch(1, 0, -1) - fetch(-1, 0, -1)), + eighth * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1) - fetch(2, 2, -1) + fetch(0, 2, -1) + fetch(2, 0, -1) - fetch(0, 0, -1)), + eighth * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, -1, 2) + fetch(-1, -1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0) + fetch(1, -1, 0) - fetch(-1, -1, 0)), + eighth * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, -1, 2) + fetch(0, -1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0) + fetch(2, -1, 0) - fetch(0, -1, 0)), + eighth * (fetch(1, 2, 2) - fetch(-1, 2, 2) - fetch(1, 0, 2) + fetch(-1, 0, 2) - fetch(1, 2, 0) + fetch(-1, 2, 0) + fetch(1, 0, 0) - fetch(-1, 0, 0)), + eighth * (fetch(2, 2, 2) - fetch(0, 2, 2) - fetch(2, 0, 2) + fetch(0, 0, 2) - fetch(2, 2, 0) + fetch(0, 2, 0) + fetch(2, 0, 0) - fetch(0, 0, 0))}; + + // 4Kb of static table (int8_t has a range of -127 -> 127 which suffices) + static const int8_t A[64][64] = { + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, -9, 9, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, 
-3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, -4, 4, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, + {-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, 0, 0, -9, 9, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, 0, 0, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0}, + {9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, + {-27, 27, 27, -27, 27, -27, -27, 27, -18, -9, 18, 9, 18, 9, -18, -9, -18, 18, -9, 9, 18, -18, 9, -9, -18, 18, 18, -18, -9, 9, 9, -9, -12, -6, -6, -3, 12, 6, 6, 3, -12, -6, 12, 6, -6, -3, 6, 3, -12, 12, -6, 6, -6, 6, -3, 3, -8, -4, -4, -2, -4, -2, -2, -1}, + {18, -18, -18, 18, -18, 18, 18, -18, 9, 9, -9, -9, -9, -9, 9, 9, 12, -12, 6, -6, -12, 12, -6, 6, 12, -12, -12, 12, 6, -6, -6, 6, 6, 6, 3, 3, -6, -6, -3, -3, 6, 6, -6, -6, 3, 3, -3, -3, 8, -8, 4, -4, 4, -4, 2, -2, 4, 4, 2, 2, 2, 2, 1, 1}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0}, + {18, -18, -18, 
18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 9, -9, 9, -9, -9, 9, -9, 9, 12, -12, -12, 12, 6, -6, -6, 6, 6, 3, 6, 3, -6, -3, -6, -3, 8, 4, -8, -4, 4, 2, -4, -2, 6, -6, 6, -6, 3, -3, 3, -3, 4, 2, 4, 2, 2, 1, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -6, 6, -6, 6, 6, -6, 6, -6, -8, 8, 8, -8, -4, 4, 4, -4, -3, -3, -3, -3, 3, 3, 3, 3, -4, -4, 4, 4, -2, -2, 2, 2, -4, 4, -4, 4, -2, 2, -2, 2, -2, -2, -2, -2, -1, -1, -1, -1}, + {2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, 0, 0, -4, 4, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, 0, 0, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0}, + {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 12, -12, 6, -6, -12, 12, -6, 6, 9, -9, -9, 9, 9, -9, -9, 9, 8, 4, 4, 2, -8, -4, -4, -2, 6, 3, -6, -3, 6, 3, -6, -3, 6, -6, 3, -3, 6, -6, 3, -3, 4, 2, 2, 1, 4, 2, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -8, 8, -4, 4, 8, -8, 4, -4, -6, 6, 6, -6, -6, 6, 6, -6, -4, -4, -2, -2, 4, 4, 2, 2, -3, -3, 3, 3, -3, -3, 3, 3, -4, 4, -2, 2, -4, 4, -2, 2, -2, -2, -1, -1, -2, -2, -1, -1}, + {4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, + {-12, 12, 12, -12, 12, -12, -12, 12, -8, -4, 8, 4, 8, 4, -8, -4, -6, 6, -6, 6, 6, -6, 6, -6, -6, 6, 6, -6, -6, 6, 6, -6, -4, -2, -4, -2, 4, 2, 4, 2, -4, -2, 4, 2, -4, -2, 4, 2, -3, 3, -3, 
3, -3, 3, -3, 3, -2, -1, -2, -1, -2, -1, -2, -1},
+        {8, -8, -8, 8, -8, 8, 8, -8, 4, 4, -4, -4, -4, -4, 4, 4, 4, -4, 4, -4, -4, 4, -4, 4, 4, -4, -4, 4, 4, -4, -4, 4, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, -2, 2, -2, 2, -2, 2, -2, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+    for (int i = 0; i < 64; ++i) { // C = A * X
+        C[i] = ValueT(0);
+#if 0
+        for (int j = 0; j < 64; j += 4) {
+            C[i] = fma(A[i][j], X[j], fma(A[i][j+1], X[j+1], fma(A[i][j+2], X[j+2], fma(A[i][j+3], X[j+3], C[i]))));
+        }
+#else
+        for (int j = 0; j < 64; j += 4) {
+            C[i] += A[i][j] * X[j] + A[i][j + 1] * X[j + 1] + A[i][j + 2] * X[j + 2] + A[i][j + 3] * X[j + 3];
+        }
+#endif
+    }
+}
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ typename TreeOrAccT::ValueType TricubicSampler<TreeOrAccT>::sample(const Vec3T<RealT> &xyz, const ValueT (&C)[64])
+{
+    ValueT zPow(1), sum(0);
+    for (int k = 0, n = 0; k < 4; ++k) {
+        ValueT yPow(1);
+        for (int j = 0; j < 4; ++j, n += 4) {
+#if 0
+            sum = fma( yPow, zPow * fma(xyz[0], fma(xyz[0], fma(xyz[0], C[n + 3], C[n + 2]), C[n + 1]), C[n]), sum);
+#else
+            sum += yPow * zPow * (C[n] + xyz[0] * (C[n + 1] + xyz[0] * (C[n + 2] + xyz[0] * C[n + 3])));
+#endif
+            yPow *= xyz[1];
+        }
+        zPow *= xyz[2];
+    }
+    return sum;
+}
+
+template<typename TreeOrAccT>
+class SampleFromVoxels<TreeOrAccT, 3, true> : public TricubicSampler<TreeOrAccT>
+{
+    using BaseT = TricubicSampler<TreeOrAccT>;
+    using ValueT = typename TreeOrAccT::ValueType;
+    using CoordT = typename TreeOrAccT::CoordType;
+
+    mutable CoordT mPos;
+    mutable ValueT mC[64];
+
+    template<typename RealT, template<typename...> class Vec3T>
+    __hostdev__ void cache(Vec3T<RealT>& xyz) const;
+
+public:
+    /// @brief Construction from a Tree or ReadAccessor
+    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc)
+        : BaseT(acc)
+    {
+    }
+
+    /// @note xyz is in index space
+    template<typename RealT, template<typename...> class Vec3T>
+    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
+
+    /// @brief Return the value at the coordinate @a ijk in index space
+    __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
+
+}; // SampleFromVoxels
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
+{
+    this->cache(xyz);
+    return BaseT::sample(xyz, mC);
+}
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
+{
+    CoordT ijk = Floor<CoordT>(xyz);
+    if (ijk != mPos) {
+        mPos = ijk;
+        BaseT::stencil(ijk, mC);
+    }
+}
+
+template<typename TreeOrAccT>
+class SampleFromVoxels<TreeOrAccT, 3, false> : public TricubicSampler<TreeOrAccT>
+{
+    using BaseT = TricubicSampler<TreeOrAccT>;
+    using ValueT = typename TreeOrAccT::ValueType;
+    using CoordT = typename TreeOrAccT::CoordType;
+
+public:
+    /// @brief Construction from a Tree or ReadAccessor
+    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc)
+        : BaseT(acc)
+    {
+    }
+
+    /// @note xyz is in index space
+    template<typename RealT, template<typename...> class Vec3T>
+    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
+
+    __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
+
+}; // SampleFromVoxels
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, false>::operator()(Vec3T<RealT> xyz) const
+{
+    ValueT C[64];
+    CoordT ijk = Floor<CoordT>(xyz);
+    BaseT::stencil(ijk, C);
+    return BaseT::sample(xyz, C);
+}
+
+}// namespace math
+
+template<int Order, typename TreeOrAccT, bool UseCache = true>
+[[deprecated("Use nanovdb::math::createSampler instead")]]
+__hostdev__ math::SampleFromVoxels<TreeOrAccT, Order, UseCache> createSampler(const TreeOrAccT& acc)
+{
+    return math::SampleFromVoxels<TreeOrAccT, Order, UseCache>(acc);
+}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/math/Stencils.h b/nanovdb/nanovdb/math/Stencils.h
new file mode 100644 index 0000000000..cc935bf827 --- /dev/null +++ b/nanovdb/nanovdb/math/Stencils.h @@ -0,0 +1,1032 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 +// +/// @author Ken Museth +/// +/// @date April 9, 2021 +/// +/// @file Stencils.h +/// +/// @brief Defines various finite-difference stencils that allow for the +/// computation of gradients of order 1 to 5, mean curvatures, +/// gaussian curvatures, principal curvatures, tri-linear interpolation, +/// zero-crossing, laplacian, and closest point transform. + +#ifndef NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED + +#include // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4 + +namespace nanovdb { + +namespace math { + +// ---------------------------- WENO5 ---------------------------- + +/// @brief Implementation of nominally fifth-order finite-difference WENO +/// @details This function returns the numerical flux. See "High Order Finite Difference and +/// Finite Volume WENO Schemes and Discontinuous Galerkin Methods for CFD" - Chi-Wang Shu +/// ICASE Report No 2001-11 (page 6). Also see ICASE No 97-65 for a more complete reference +/// (Shu, 1997). +/// Given v1 = f(x-2dx), v2 = f(x-dx), v3 = f(x), v4 = f(x+dx) and v5 = f(x+2dx), +/// return an interpolated value f(x+dx/2) with the special property that +/// ( f(x+dx/2) - f(x-dx/2) ) / dx = df/dx (x) + error, +/// where the error is fifth-order in smooth regions: O(dx) <= error <=O(dx^5) +template +__hostdev__ inline ValueType +WENO5(const ValueType& v1, + const ValueType& v2, + const ValueType& v3, + const ValueType& v4, + const ValueType& v5, + RealT scale2 = 1.0)// openvdb uses scale2 = 0.01 +{ + static const RealT C = 13.0 / 12.0; + // WENO is formulated for non-dimensional equations, here the optional scale2 + // is a reference value (squared) for the function being interpolated. For + // example if 'v' is of order 1000, then scale2 = 10^6 is ok. But in practice + // leave scale2 = 1. 
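+    // Each denominator below is (beta_k + eps)^2, where beta_k is the smoothness indicator of one
+    // of the three candidate sub-stencils {v1,v2,v3}, {v2,v3,v4} and {v3,v4,v5}; eps guards
+    // against division by zero on locally flat data.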
+ const RealT eps = RealT(1.0e-6) * scale2; + // {\tilde \omega_k} = \gamma_k / ( \beta_k + \epsilon)^2 in Shu's ICASE report) + const RealT A1 = RealT(0.1)/Pow2(C*Pow2(v1-2*v2+v3)+RealT(0.25)*Pow2(v1-4*v2+3*v3)+eps), + A2 = RealT(0.6)/Pow2(C*Pow2(v2-2*v3+v4)+RealT(0.25)*Pow2(v2-v4)+eps), + A3 = RealT(0.3)/Pow2(C*Pow2(v3-2*v4+v5)+RealT(0.25)*Pow2(3*v3-4*v4+v5)+eps); + + return static_cast((A1*(2*v1 - 7*v2 + 11*v3) + + A2*(5*v3 - v2 + 2*v4) + + A3*(2*v3 + 5*v4 - v5))/(6*(A1+A2+A3))); +} + +// ---------------------------- GodunovsNormSqrd ---------------------------- + +template +__hostdev__ inline RealT +GodunovsNormSqrd(bool isOutside, + RealT dP_xm, RealT dP_xp, + RealT dP_ym, RealT dP_yp, + RealT dP_zm, RealT dP_zp) +{ + RealT dPLen2; + if (isOutside) { // outside + dPLen2 = Max(Pow2(Max(dP_xm, RealT(0))), Pow2(Min(dP_xp, RealT(0)))); // (dP/dx)2 + dPLen2 += Max(Pow2(Max(dP_ym, RealT(0))), Pow2(Min(dP_yp, RealT(0)))); // (dP/dy)2 + dPLen2 += Max(Pow2(Max(dP_zm, RealT(0))), Pow2(Min(dP_zp, RealT(0)))); // (dP/dz)2 + } else { // inside + dPLen2 = Max(Pow2(Min(dP_xm, RealT(0))), Pow2(Max(dP_xp, RealT(0)))); // (dP/dx)2 + dPLen2 += Max(Pow2(Min(dP_ym, RealT(0))), Pow2(Max(dP_yp, RealT(0)))); // (dP/dy)2 + dPLen2 += Max(Pow2(Min(dP_zm, RealT(0))), Pow2(Max(dP_zp, RealT(0)))); // (dP/dz)2 + } + return dPLen2; // |\nabla\phi|^2 +} + +template +__hostdev__ inline RealT +GodunovsNormSqrd(bool isOutside, + const Vec3& gradient_m, + const Vec3& gradient_p) +{ + return GodunovsNormSqrd(isOutside, + gradient_m[0], gradient_p[0], + gradient_m[1], gradient_p[1], + gradient_m[2], gradient_p[2]); +} + +// ---------------------------- BaseStencil ---------------------------- + +// BaseStencil uses curiously recurring template pattern (CRTP) +template +class BaseStencil +{ +public: + using ValueType = typename GridT::ValueType; + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using AccessorType = typename GridT::AccessorType;// ReadAccessor; + + /// @brief Initialize the stencil buffer with the values of voxel (i, j, k) + /// and its neighbors. + /// @param ijk Index coordinates of stencil center + __hostdev__ inline void moveTo(const Coord& ijk) + { + mCenter = ijk; + mValues[0] = mAcc.getValue(ijk); + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel (i, j, k) + /// and its neighbors. The method also takes a value of the center + /// element of the stencil, assuming it is already known. + /// @param ijk Index coordinates of stencil center + /// @param centerValue Value of the center element of the stencil + __hostdev__ inline void moveTo(const Coord& ijk, const ValueType& centerValue) + { + mCenter = ijk; + mValues[0] = centerValue; + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel + /// (x, y, z) and its neighbors. + /// + /// @note This version is slightly faster than the one above, since + /// the center voxel's value is read directly from the iterator. + template + __hostdev__ inline void moveTo(const IterType& iter) + { + mCenter = iter.getCoord(); + mValues[0] = *iter; + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel (x, y, z) + /// and its neighbors. + /// @param xyz Floating point voxel coordinates of stencil center + /// @details This method will check to see if it is necessary to + /// update the stencil based on the cached index coordinates of + /// the center point. 
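+    ///
+    /// @par Example (usage sketch; assumes @c grid is a nanovdb::FloatGrid)
+    /// @code
+    /// GradStencil<nanovdb::FloatGrid> stencil(grid);
+    /// stencil.moveTo(nanovdb::Vec3d(1.3, 2.7, 3.1)); // buffers the voxel containing this point
+    /// auto v = stencil.getCenterValue();
+    /// @endcode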
+    template<typename RealT>
+    __hostdev__ inline void moveTo(const Vec3<RealT>& xyz)
+    {
+        Coord ijk = RoundDown(xyz);
+        if (ijk != mCenter) this->moveTo(ijk);
+    }
+
+    /// @brief Return the value from the stencil buffer with linear
+    /// offset pos.
+    ///
+    /// @note The default (@a pos = 0) corresponds to the first element
+    /// which is typically the center point of the stencil.
+    __hostdev__ inline const ValueType& getValue(unsigned int pos = 0) const
+    {
+        NANOVDB_ASSERT(pos < SIZE);
+        return mValues[pos];
+    }
+
+    /// @brief Return the value at the specified location relative to the center of the stencil
+    template<int i, int j, int k>
+    __hostdev__ inline const ValueType& getValue() const
+    {
+        return mValues[static_cast<const DerivedType&>(*this).template pos<i, j, k>()];
+    }
+
+    /// @brief Set the value at the specified location relative to the center of the stencil
+    template<int i, int j, int k>
+    __hostdev__ inline void setValue(const ValueType& value)
+    {
+        mValues[static_cast<const DerivedType&>(*this).template pos<i, j, k>()] = value;
+    }
+
+    /// @brief Return the size of the stencil buffer.
+    __hostdev__ static int size() { return SIZE; }
+
+    /// @brief Return the mean value of the current stencil.
+    __hostdev__ inline ValueType mean() const
+    {
+        ValueType sum = 0.0;
+        for (int i = 0; i < SIZE; ++i) sum += mValues[i];
+        return sum / ValueType(SIZE);
+    }
+
+    /// @brief Return the smallest value in the stencil buffer.
+    __hostdev__ inline ValueType min() const
+    {
+        ValueType v = mValues[0];
+        for (int i = 1; i < SIZE; ++i) {
+            if (mValues[i] < v) v = mValues[i];
+        }
+        return v;
+    }
+
+    /// @brief Return the largest value in the stencil buffer.
+    __hostdev__ inline ValueType max() const
+    {
+        ValueType v = mValues[0];
+        for (int i = 1; i < SIZE; ++i) {
+            if (mValues[i] > v) v = mValues[i];
+        }
+        return v;
+    }
+
+    /// @brief Return the coordinates of the center point of the stencil.
+    __hostdev__ inline const Coord& getCenterCoord() const { return mCenter; }
+
+    /// @brief Return the value at the center of the stencil
+    __hostdev__ inline const ValueType& getCenterValue() const { return mValues[0]; }
+
+    /// @brief Return true if the center of the stencil intersects the
+    /// iso-contour specified by the isoValue
+    __hostdev__ inline bool intersects(const ValueType &isoValue = ValueType(0)) const
+    {
+        const bool less = this->getValue< 0, 0, 0>() < isoValue;
+        return (less ^ (this->getValue<-1, 0, 0>() < isoValue)) ||
+               (less ^ (this->getValue< 1, 0, 0>() < isoValue)) ||
+               (less ^ (this->getValue< 0,-1, 0>() < isoValue)) ||
+               (less ^ (this->getValue< 0, 1, 0>() < isoValue)) ||
+               (less ^ (this->getValue< 0, 0,-1>() < isoValue)) ||
+               (less ^ (this->getValue< 0, 0, 1>() < isoValue)) ;
+    }
+    struct Mask {
+        uint8_t bits;
+        __hostdev__ Mask() : bits(0u) {}
+        __hostdev__ void set(int i) { bits |= (1 << i); }
+        __hostdev__ bool test(int i) const { return bits & (1 << i); }
+        __hostdev__ bool any() const { return bits > 0u; }
+        __hostdev__ bool all() const { return bits == 255u; }
+        __hostdev__ bool none() const { return bits == 0u; }
+        __hostdev__ int count() const { return util::countOn(bits); }
+    };// Mask
+
+    /// @brief Return a bit-mask whose 6 lower bits indicate if the
+    /// center of the stencil intersects the iso-contour specified by the isoValue.
+    ///
+    /// @note There are 2^6 = 64 different possible cases, including no intersections!
+    ///
+    /// @details The ordering of the bit mask is ( -x, +x, -y, +y, -z, +z ), so to
+    /// check if there is an intersection in -y use (mask & (1u<<2)) where mask is
+    /// the return value from this function. To check if there are any
+    /// intersections use mask!=0u, and for no intersections use mask==0u.
+    /// To count the number of intersections use __builtin_popcount(mask).
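+    ///
+    /// @par Example (sketch): detect and count iso-surface crossings at the current location
+    /// @code
+    /// auto mask = stencil.intersectionMask(0.0f);
+    /// if (mask.any()) {
+    ///     int n = mask.count();       // number of crossing directions, 0-6
+    ///     bool minusY = mask.test(2); // bit 2 corresponds to the -y direction
+    /// }
+    /// @endcode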
+ __hostdev__ inline Mask intersectionMask(ValueType isoValue = ValueType(0)) const + { + Mask mask; + const bool less = this->getValue< 0, 0, 0>() < isoValue; + if (less ^ (this->getValue<-1, 0, 0>() < isoValue)) mask.set(0);// |= 1u; + if (less ^ (this->getValue< 1, 0, 0>() < isoValue)) mask.set(1);// |= 2u; + if (less ^ (this->getValue< 0,-1, 0>() < isoValue)) mask.set(2);// |= 4u; + if (less ^ (this->getValue< 0, 1, 0>() < isoValue)) mask.set(3);// |= 8u; + if (less ^ (this->getValue< 0, 0,-1>() < isoValue)) mask.set(4);// |= 16u; + if (less ^ (this->getValue< 0, 0, 1>() < isoValue)) mask.set(5);// |= 32u; + return mask; + } + + /// @brief Return a const reference to the grid from which this + /// stencil was constructed. + __hostdev__ inline const GridType& grid() const { return *mGrid; } + + /// @brief Return a const reference to the ValueAccessor + /// associated with this Stencil. + __hostdev__ inline const AccessorType& accessor() const { return mAcc; } + +protected: + // Constructor is protected to prevent direct instantiation. + __hostdev__ BaseStencil(const GridType& grid) + : mGrid(&grid) + , mAcc(grid) + , mCenter(Coord::max()) + { + } + + const GridType* mGrid; + AccessorType mAcc; + ValueType mValues[SIZE]; + Coord mCenter; + +}; // BaseStencil class + + +// ---------------------------- BoxStencil ---------------------------- + + +namespace { // anonymous namespace for stencil-layout map + + // the eight point box stencil + template struct BoxPt {}; + template<> struct BoxPt< 0, 0, 0> { enum { idx = 0 }; }; + template<> struct BoxPt< 0, 0, 1> { enum { idx = 1 }; }; + template<> struct BoxPt< 0, 1, 1> { enum { idx = 2 }; }; + template<> struct BoxPt< 0, 1, 0> { enum { idx = 3 }; }; + template<> struct BoxPt< 1, 0, 0> { enum { idx = 4 }; }; + template<> struct BoxPt< 1, 0, 1> { enum { idx = 5 }; }; + template<> struct BoxPt< 1, 1, 1> { enum { idx = 6 }; }; + template<> struct BoxPt< 1, 1, 0> { enum { idx = 7 }; }; + +} + +template +class BoxStencil: public BaseStencil, 8, GridT> +{ + using SelfT = BoxStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 8; + + __hostdev__ BoxStencil(const GridType& grid) : BaseType(grid) {} + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return BoxPt::idx; } + + /// @brief Return true if the center of the stencil intersects the + /// iso-contour specified by the isoValue + __hostdev__ inline bool intersects(ValueType isoValue = ValueType(0)) const + { + const bool less = mValues[0] < isoValue; + return (less ^ (mValues[1] < isoValue)) || + (less ^ (mValues[2] < isoValue)) || + (less ^ (mValues[3] < isoValue)) || + (less ^ (mValues[4] < isoValue)) || + (less ^ (mValues[5] < isoValue)) || + (less ^ (mValues[6] < isoValue)) || + (less ^ (mValues[7] < isoValue)) ; + } + + /// @brief Return the trilinear interpolation at the normalized position. + /// @param xyz Floating point coordinate position. Index space and NOT world space. + /// @warning It is assumed that the stencil has already been moved + /// to the relevant voxel position, e.g. using moveTo(xyz). 
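+    /// @par Example (sketch, assuming @c grid is a nanovdb::FloatGrid):
+    /// @code
+    /// BoxStencil<nanovdb::FloatGrid> stencil(grid);
+    /// nanovdb::Vec3f xyz(1.4f, 2.6f, 3.5f); // index-space position
+    /// stencil.moveTo(xyz);
+    /// float v = stencil.interpolation(xyz); // tri-linear value at xyz
+    /// @endcode
+    ///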
+ /// @note Trilinear interpolation kernal reads as: + /// v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw + /// + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw + __hostdev__ inline ValueType interpolation(const Vec3& xyz) const + { + const ValueType u = xyz[0] - mCenter[0]; + const ValueType v = xyz[1] - mCenter[1]; + const ValueType w = xyz[2] - mCenter[2]; + + NANOVDB_ASSERT(u>=0 && u<=1); + NANOVDB_ASSERT(v>=0 && v<=1); + NANOVDB_ASSERT(w>=0 && w<=1); + + ValueType V = BaseType::template getValue<0,0,0>(); + ValueType A = V + (BaseType::template getValue<0,0,1>() - V) * w; + V = BaseType::template getValue< 0, 1, 0>(); + ValueType B = V + (BaseType::template getValue<0,1,1>() - V) * w; + ValueType C = A + (B - A) * v; + + V = BaseType::template getValue<1,0,0>(); + A = V + (BaseType::template getValue<1,0,1>() - V) * w; + V = BaseType::template getValue<1,1,0>(); + B = V + (BaseType::template getValue<1,1,1>() - V) * w; + ValueType D = A + (B - A) * v; + + return C + (D - C) * u; + } + + /// @brief Return the gradient in world space of the trilinear interpolation kernel. + /// @param xyz Floating point coordinate position. + /// @warning It is assumed that the stencil has already been moved + /// to the relevant voxel position, e.g. using moveTo(xyz). + /// @note Computed as partial derivatives of the trilinear interpolation kernel: + /// v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw + /// + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw + __hostdev__ inline Vec3 gradient(const Vec3& xyz) const + { + const ValueType u = xyz[0] - mCenter[0]; + const ValueType v = xyz[1] - mCenter[1]; + const ValueType w = xyz[2] - mCenter[2]; + + NANOVDB_ASSERT(u>=0 && u<=1); + NANOVDB_ASSERT(v>=0 && v<=1); + NANOVDB_ASSERT(w>=0 && w<=1); + + ValueType D[4]={BaseType::template getValue<0,0,1>()-BaseType::template getValue<0,0,0>(), + BaseType::template getValue<0,1,1>()-BaseType::template getValue<0,1,0>(), + BaseType::template getValue<1,0,1>()-BaseType::template getValue<1,0,0>(), + BaseType::template getValue<1,1,1>()-BaseType::template getValue<1,1,0>()}; + + // Z component + ValueType A = D[0] + (D[1]- D[0]) * v; + ValueType B = D[2] + (D[3]- D[2]) * v; + Vec3 grad(0, 0, A + (B - A) * u); + + D[0] = BaseType::template getValue<0,0,0>() + D[0] * w; + D[1] = BaseType::template getValue<0,1,0>() + D[1] * w; + D[2] = BaseType::template getValue<1,0,0>() + D[2] * w; + D[3] = BaseType::template getValue<1,1,0>() + D[3] * w; + + // X component + A = D[0] + (D[1] - D[0]) * v; + B = D[2] + (D[3] - D[2]) * v; + + grad[0] = B - A; + + // Y component + A = D[1] - D[0]; + B = D[3] - D[2]; + + grad[1] = A + (B - A) * u; + + return BaseType::mGrid->map().applyIJT(grad); + } + +private: + __hostdev__ inline void init(const Coord& ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy( 0, 1, 1)); + mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 1, 0, 1)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 1, 1, 1)); + mValues[ 7] = mAcc.getValue(ijk.offsetBy( 1, 1, 0)); + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + using BaseType::mCenter; +};// BoxStencil class + + +// ---------------------------- GradStencil ---------------------------- + +namespace { // anonymous namespace for stencil-layout map + + 
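+    // Layout of the 7-point stencil (see GradPt below): index 0 is the center voxel,
+    // odd indices 1, 3, 5 hold the -x, -y, -z neighbors and even indices 2, 4, 6 the
+    // +x, +y, +z neighbors.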
template struct GradPt {}; + template<> struct GradPt< 0, 0, 0> { enum { idx = 0 }; }; + template<> struct GradPt< 1, 0, 0> { enum { idx = 2 }; }; + template<> struct GradPt< 0, 1, 0> { enum { idx = 4 }; }; + template<> struct GradPt< 0, 0, 1> { enum { idx = 6 }; }; + template<> struct GradPt<-1, 0, 0> { enum { idx = 1 }; }; + template<> struct GradPt< 0,-1, 0> { enum { idx = 3 }; }; + template<> struct GradPt< 0, 0,-1> { enum { idx = 5 }; }; +} + +/// This is a simple 7-point nearest neighbor stencil that supports +/// gradient by second-order central differencing, first-order upwinding, +/// Laplacian, closest-point transform and zero-crossing test. +/// +/// @note For optimal random access performance this class +/// includes its own grid accessor. +template +class GradStencil : public BaseStencil, 7, GridT> +{ + using SelfT = GradStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 7; + + __hostdev__ GradStencil(const GridType& grid) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + __hostdev__ GradStencil(const GridType& grid, double dx) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + /// @brief Return the norm square of the single-sided upwind gradient + /// (computed via Godunov's scheme) at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType normSqGrad() const + { + return mInvDx2 * GodunovsNormSqrd(mValues[0] > ValueType(0), + mValues[0] - mValues[1], + mValues[2] - mValues[0], + mValues[0] - mValues[3], + mValues[4] - mValues[0], + mValues[0] - mValues[5], + mValues[6] - mValues[0]); + } + + /// @brief Return the gradient computed at the previously buffered + /// location by second order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return Vec3(mValues[2] - mValues[1], + mValues[4] - mValues[3], + mValues[6] - mValues[5])*mInv2Dx; + } + /// @brief Return the first-order upwind gradient corresponding to the direction V. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient(const Vec3& V) const + { + return Vec3( + V[0]>0 ? mValues[0] - mValues[1] : mValues[2] - mValues[0], + V[1]>0 ? mValues[0] - mValues[3] : mValues[4] - mValues[0], + V[2]>0 ? mValues[0] - mValues[5] : mValues[6] - mValues[0])*2*mInv2Dx; + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * (mValues[1] + mValues[2] + + mValues[3] + mValues[4] + + mValues[5] + mValues[6] - 6*mValues[0]); + } + + /// Return @c true if the sign of the value at the center point of the stencil + /// is different from the signs of any of its six nearest neighbors. + __hostdev__ inline bool zeroCrossing() const + { + return (mValues[0]>0 ? 
(mValues[1]<0 || mValues[2]<0 || mValues[3]<0 || mValues[4]<0 || mValues[5]<0 || mValues[6]<0)
+                              : (mValues[1]>0 || mValues[2]>0 || mValues[3]>0 || mValues[4]>0 || mValues[5]>0 || mValues[6]>0));
+    }
+
+    /// @brief Compute the closest-point transform to a level set.
+    /// @return the closest point in index space to the surface
+    ///         from which the level set was derived.
+    ///
+    /// @note This method assumes that the grid represents a level set
+    /// with distances in world units and a simple affine transform
+    /// with uniform scaling.
+    __hostdev__ inline Vec3<ValueType> cpt()
+    {
+        const Coord& ijk = BaseType::getCenterCoord();
+        const ValueType d = ValueType(mValues[0] * 0.5 * mInvDx2); // distance in voxels / (2dx^2)
+        const auto value = Vec3<ValueType>(ijk[0] - d*(mValues[2] - mValues[1]),
+                                           ijk[1] - d*(mValues[4] - mValues[3]),
+                                           ijk[2] - d*(mValues[6] - mValues[5]));
+        return value;
+    }
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    __hostdev__ unsigned int pos() const { return GradPt<i, j, k>::idx; }
+
+private:
+
+    __hostdev__ inline void init(const Coord& ijk)
+    {
+        mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0));
+        mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0));
+
+        mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0,-1, 0));
+        mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0));
+
+        mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0,-1));
+        mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1));
+    }
+
+    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mAcc;
+    using BaseType::mValues;
+    const ValueType mInv2Dx, mInvDx2;
+}; // GradStencil class
+
+
+// ---------------------------- WenoStencil ----------------------------
+
+namespace { // anonymous namespace for stencil-layout map
+
+    template<int i, int j, int k> struct WenoPt {};
+    template<> struct WenoPt< 0, 0, 0> { enum { idx = 0 }; };
+
+    template<> struct WenoPt<-3, 0, 0> { enum { idx = 1 }; };
+    template<> struct WenoPt<-2, 0, 0> { enum { idx = 2 }; };
+    template<> struct WenoPt<-1, 0, 0> { enum { idx = 3 }; };
+    template<> struct WenoPt< 1, 0, 0> { enum { idx = 4 }; };
+    template<> struct WenoPt< 2, 0, 0> { enum { idx = 5 }; };
+    template<> struct WenoPt< 3, 0, 0> { enum { idx = 6 }; };
+
+    template<> struct WenoPt< 0,-3, 0> { enum { idx = 7 }; };
+    template<> struct WenoPt< 0,-2, 0> { enum { idx = 8 }; };
+    template<> struct WenoPt< 0,-1, 0> { enum { idx = 9 }; };
+    template<> struct WenoPt< 0, 1, 0> { enum { idx =10 }; };
+    template<> struct WenoPt< 0, 2, 0> { enum { idx =11 }; };
+    template<> struct WenoPt< 0, 3, 0> { enum { idx =12 }; };
+
+    template<> struct WenoPt< 0, 0,-3> { enum { idx =13 }; };
+    template<> struct WenoPt< 0, 0,-2> { enum { idx =14 }; };
+    template<> struct WenoPt< 0, 0,-1> { enum { idx =15 }; };
+    template<> struct WenoPt< 0, 0, 1> { enum { idx =16 }; };
+    template<> struct WenoPt< 0, 0, 2> { enum { idx =17 }; };
+    template<> struct WenoPt< 0, 0, 3> { enum { idx =18 }; };
+
+}
+
+/// @brief This is a special 19-point stencil that supports optimal fifth-order WENO
+/// upwinding, second-order central differencing, Laplacian, and zero-crossing test.
+///
+/// @note For optimal random access performance this class
+/// includes its own grid accessor.
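+///
+/// @par Usage sketch (assuming @c grid is a nanovdb::FloatGrid):
+/// @code
+/// WenoStencil<nanovdb::FloatGrid> stencil(grid);
+/// stencil.moveTo(nanovdb::Coord(i, j, k));
+/// auto s = stencil.normSqGrad(); // |grad phi|^2 via 5th-order WENO + Godunov
+/// @endcode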
+template +class WenoStencil: public BaseStencil, 19, GridT> +{ + using SelfT = WenoStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 19; + + __hostdev__ WenoStencil(const GridType& grid) + : BaseType(grid) + , mDx2(ValueType(Pow2(grid.voxelSize()[0]))) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(1.0 / mDx2)) + { + } + + __hostdev__ WenoStencil(const GridType& grid, double dx) + : BaseType(grid) + , mDx2(ValueType(dx * dx)) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(1.0 / mDx2)) + { + } + + /// @brief Return the norm-square of the WENO upwind gradient (computed via + /// WENO upwinding and Godunov's scheme) at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType normSqGrad(ValueType isoValue = ValueType(0)) const + { + const ValueType* v = mValues; + const RealT + dP_xm = WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3],v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2), + dP_xp = WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0],v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2), + dP_ym = WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9],v[10]-v[ 0],v[11]-v[10],mDx2), + dP_yp = WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0],v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2), + dP_zm = WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15],v[16]-v[ 0],v[17]-v[16],mDx2), + dP_zp = WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0],v[ 0]-v[15],v[15]-v[14],mDx2); + return mInvDx2*static_cast( + GodunovsNormSqrd(v[0]>isoValue, dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp)); + } + + /// Return the optimal fifth-order upwind gradient corresponding to the + /// direction V. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient(const Vec3& V) const + { + const ValueType* v = mValues; + return 2*mInv2Dx * Vec3( + V[0]>0 ? WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3], v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2) + : WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0], v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2), + V[1]>0 ? WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9], v[10]-v[ 0],v[11]-v[10],mDx2) + : WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0], v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2), + V[2]>0 ? WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15], v[16]-v[ 0],v[17]-v[16],mDx2) + : WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0], v[ 0]-v[15],v[15]-v[14],mDx2)); + } + /// Return the gradient computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return mInv2Dx * Vec3(mValues[ 4] - mValues[ 3], + mValues[10] - mValues[ 9], + mValues[16] - mValues[15]); + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). 
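+    ///
+    /// @details In this 19-point stencil the Laplacian reduces to the standard 7-point
+    /// formula (v(+x) + v(-x) + v(+y) + v(-y) + v(+z) + v(-z) - 6*v(0)) / dx^2.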
+ __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * ( + mValues[ 3] + mValues[ 4] + + mValues[ 9] + mValues[10] + + mValues[15] + mValues[16] - 6*mValues[0]); + } + + /// Return @c true if the sign of the value at the center point of the stencil + /// differs from the sign of any of its six nearest neighbors + __hostdev__ inline bool zeroCrossing() const + { + const ValueType* v = mValues; + return (v[ 0]>0 ? (v[ 3]<0 || v[ 4]<0 || v[ 9]<0 || v[10]<0 || v[15]<0 || v[16]<0) + : (v[ 3]>0 || v[ 4]>0 || v[ 9]>0 || v[10]>0 || v[15]>0 || v[16]>0)); + } + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return WenoPt::idx; } + +private: + __hostdev__ inline void init(const Coord& ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy(-3, 0, 0)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy(-2, 0, 0)); + mValues[ 3] = mAcc.getValue(ijk.offsetBy(-1, 0, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 2, 0, 0)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 3, 0, 0)); + + mValues[ 7] = mAcc.getValue(ijk.offsetBy( 0, -3, 0)); + mValues[ 8] = mAcc.getValue(ijk.offsetBy( 0, -2, 0)); + mValues[ 9] = mAcc.getValue(ijk.offsetBy( 0, -1, 0)); + mValues[10] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + mValues[11] = mAcc.getValue(ijk.offsetBy( 0, 2, 0)); + mValues[12] = mAcc.getValue(ijk.offsetBy( 0, 3, 0)); + + mValues[13] = mAcc.getValue(ijk.offsetBy( 0, 0, -3)); + mValues[14] = mAcc.getValue(ijk.offsetBy( 0, 0, -2)); + mValues[15] = mAcc.getValue(ijk.offsetBy( 0, 0, -1)); + mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + mValues[17] = mAcc.getValue(ijk.offsetBy( 0, 0, 2)); + mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 0, 3)); + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + const ValueType mDx2, mInv2Dx, mInvDx2; +}; // WenoStencil class + + +// ---------------------------- CurvatureStencil ---------------------------- + +namespace { // anonymous namespace for stencil-layout map + + template struct CurvPt {}; + template<> struct CurvPt< 0, 0, 0> { enum { idx = 0 }; }; + + template<> struct CurvPt<-1, 0, 0> { enum { idx = 1 }; }; + template<> struct CurvPt< 1, 0, 0> { enum { idx = 2 }; }; + + template<> struct CurvPt< 0,-1, 0> { enum { idx = 3 }; }; + template<> struct CurvPt< 0, 1, 0> { enum { idx = 4 }; }; + + template<> struct CurvPt< 0, 0,-1> { enum { idx = 5 }; }; + template<> struct CurvPt< 0, 0, 1> { enum { idx = 6 }; }; + + template<> struct CurvPt<-1,-1, 0> { enum { idx = 7 }; }; + template<> struct CurvPt< 1,-1, 0> { enum { idx = 8 }; }; + template<> struct CurvPt<-1, 1, 0> { enum { idx = 9 }; }; + template<> struct CurvPt< 1, 1, 0> { enum { idx =10 }; }; + + template<> struct CurvPt<-1, 0,-1> { enum { idx =11 }; }; + template<> struct CurvPt< 1, 0,-1> { enum { idx =12 }; }; + template<> struct CurvPt<-1, 0, 1> { enum { idx =13 }; }; + template<> struct CurvPt< 1, 0, 1> { enum { idx =14 }; }; + + template<> struct CurvPt< 0,-1,-1> { enum { idx =15 }; }; + template<> struct CurvPt< 0, 1,-1> { enum { idx =16 }; }; + template<> struct CurvPt< 0,-1, 1> { enum { idx =17 }; }; + template<> struct CurvPt< 0, 1, 1> { enum { idx =18 }; }; + +} + +template +class CurvatureStencil: public BaseStencil, 19, GridT> +{ + using SelfT = CurvatureStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using 
ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 19; + + __hostdev__ CurvatureStencil(const GridType& grid) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + __hostdev__ CurvatureStencil(const GridType& grid, double dx) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + /// @brief Return the mean curvature at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType meanCurvature() const + { + RealT alpha, normGrad; + return this->meanCurvature(alpha, normGrad) ? + ValueType(alpha*mInv2Dx/Pow3(normGrad)) : 0; + } + + /// @brief Return the Gaussian curvature at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType gaussianCurvature() const + { + RealT alpha, normGrad; + return this->gaussianCurvature(alpha, normGrad) ? + ValueType(alpha*mInvDx2/Pow4(normGrad)) : 0; + } + + /// @brief Return both the mean and the Gaussian curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline void curvatures(ValueType &mean, ValueType& gauss) const + { + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + mean = ValueType(alphaM*mInv2Dx/Pow3(normGrad)); + gauss = ValueType(alphaG*mInvDx2/Pow4(normGrad)); + } else { + mean = gauss = 0; + } + } + + /// Return the mean curvature multiplied by the norm of the + /// central-difference gradient. This method is very useful for + /// mean-curvature flow of level sets! + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType meanCurvatureNormGrad() const + { + RealT alpha, normGrad; + return this->meanCurvature(alpha, normGrad) ? + ValueType(alpha*mInvDx2/(2*Pow2(normGrad))) : 0; + } + + /// Return the mean Gaussian multiplied by the norm of the + /// central-difference gradient. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType gaussianCurvatureNormGrad() const + { + RealT alpha, normGrad; + return this->gaussianCurvature(alpha, normGrad) ? + ValueType(2*alpha*mInv2Dx*mInvDx2/Pow3(normGrad)) : 0; + } + + /// @brief Return both the mean and the Gaussian curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline void curvaturesNormGrad(ValueType &mean, ValueType& gauss) const + { + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + mean = ValueType(alphaM*mInvDx2/(2*Pow2(normGrad))); + gauss = ValueType(2*alphaG*mInv2Dx*mInvDx2/Pow3(normGrad)); + } else { + mean = gauss = 0; + } + } + + /// @brief Computes the minimum and maximum principal curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). 
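+    ///
+    /// @par Example (sketch, on a level-set grid):
+    /// @code
+    /// CurvatureStencil<nanovdb::FloatGrid> stencil(grid);
+    /// stencil.moveTo(ijk);
+    /// float kMin, kMax;
+    /// stencil.principalCurvatures(kMin, kMax); // kMin <= kMax
+    /// @endcode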
+ __hostdev__ inline void principalCurvatures(ValueType &min, ValueType &max) const + { + min = max = 0; + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + const RealT mean = alphaM*mInv2Dx/Pow3(normGrad); + const RealT tmp = Sqrt(mean*mean - alphaG*mInvDx2/Pow4(normGrad)); + min = ValueType(mean - tmp); + max = ValueType(mean + tmp); + } + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * ( + mValues[1] + mValues[2] + + mValues[3] + mValues[4] + + mValues[5] + mValues[6] - 6*mValues[0]); + } + + /// Return the gradient computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return Vec3( + mValues[2] - mValues[1], + mValues[4] - mValues[3], + mValues[6] - mValues[5])*mInv2Dx; + } + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return CurvPt::idx; } + +private: + __hostdev__ inline void init(const Coord &ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + + mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, -1, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0, -1)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + + mValues[ 7] = mAcc.getValue(ijk.offsetBy(-1, -1, 0)); + mValues[ 8] = mAcc.getValue(ijk.offsetBy( 1, -1, 0)); + mValues[ 9] = mAcc.getValue(ijk.offsetBy(-1, 1, 0)); + mValues[10] = mAcc.getValue(ijk.offsetBy( 1, 1, 0)); + + mValues[11] = mAcc.getValue(ijk.offsetBy(-1, 0, -1)); + mValues[12] = mAcc.getValue(ijk.offsetBy( 1, 0, -1)); + mValues[13] = mAcc.getValue(ijk.offsetBy(-1, 0, 1)); + mValues[14] = mAcc.getValue(ijk.offsetBy( 1, 0, 1)); + + mValues[15] = mAcc.getValue(ijk.offsetBy( 0, -1, -1)); + mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 1, -1)); + mValues[17] = mAcc.getValue(ijk.offsetBy( 0, -1, 1)); + mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 1, 1)); + } + + __hostdev__ inline RealT Dx() const { return 0.5*(mValues[2] - mValues[1]); }// * 1/dx + __hostdev__ inline RealT Dy() const { return 0.5*(mValues[4] - mValues[3]); }// * 1/dx + __hostdev__ inline RealT Dz() const { return 0.5*(mValues[6] - mValues[5]); }// * 1/dx + __hostdev__ inline RealT Dxx() const { return mValues[2] - 2 * mValues[0] + mValues[1]; }// * 1/dx2 + __hostdev__ inline RealT Dyy() const { return mValues[4] - 2 * mValues[0] + mValues[3]; }// * 1/dx2} + __hostdev__ inline RealT Dzz() const { return mValues[6] - 2 * mValues[0] + mValues[5]; }// * 1/dx2 + __hostdev__ inline RealT Dxy() const { return 0.25 * (mValues[10] - mValues[ 8] + mValues[ 7] - mValues[ 9]); }// * 1/dx2 + __hostdev__ inline RealT Dxz() const { return 0.25 * (mValues[14] - mValues[12] + mValues[11] - mValues[13]); }// * 1/dx2 + __hostdev__ inline RealT Dyz() const { return 0.25 * (mValues[18] - mValues[16] + mValues[15] - mValues[17]); }// * 1/dx2 + + __hostdev__ inline bool meanCurvature(RealT& alpha, RealT& normGrad) const + { + // For performance all finite differences are unscaled wrt dx + const RealT Dx 
= this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
+                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
+        if (normGrad2 <= Tolerance<RealT>::value()) {
+            alpha = normGrad = 0;
+            return false;
+        }
+        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz();
+        alpha = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) -
+                2*(Dx*(Dy*this->Dxy() + Dz*this->Dxz()) + Dy*Dz*this->Dyz());// * 1/dx^4
+        normGrad = Sqrt(normGrad2); // * 1/dx
+        return true;
+    }
+
+    __hostdev__ inline bool gaussianCurvature(RealT& alpha, RealT& normGrad) const
+    {
+        // For performance all finite differences are unscaled wrt dx
+        const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
+                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
+        if (normGrad2 <= Tolerance<RealT>::value()) {
+            alpha = normGrad = 0;
+            return false;
+        }
+        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(),
+                    Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz();
+        alpha = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) +
+                2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// * 1/dx^6
+        normGrad = Sqrt(normGrad2); // * 1/dx
+        return true;
+    }
+
+    __hostdev__ inline bool curvatures(RealT& alphaM, RealT& alphaG, RealT& normGrad) const
+    {
+        // For performance all finite differences are unscaled wrt dx
+        const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
+                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
+        if (normGrad2 <= Tolerance<RealT>::value()) {
+            alphaM = alphaG = normGrad = 0;
+            return false;
+        }
+        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(),
+                    Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz();
+        alphaM = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) -
+                 2*(Dx*(Dy*Dxy + Dz*Dxz) + Dy*Dz*Dyz);// *1/dx^4
+        alphaG = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) +
+                 2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// *1/dx^6
+        normGrad = Sqrt(normGrad2); // * 1/dx
+        return true;
+    }
+
+    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mAcc;
+    using BaseType::mValues;
+    const ValueType mInv2Dx, mInvDx2;
+}; // CurvatureStencil class
+
+}// namespace math
+
+} // end nanovdb namespace
+
+#endif // NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/CreateNanoGrid.h b/nanovdb/nanovdb/tools/CreateNanoGrid.h
new file mode 100644
index 0000000000..0615cb6e22
--- /dev/null
+++ b/nanovdb/nanovdb/tools/CreateNanoGrid.h
@@ -0,0 +1,2073 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/CreateNanoGrid.h
+
+    \author Ken Museth
+
+    \date June 26, 2020
+
+    \note In the examples below we assume that @c srcGrid is an existing grid of type
+          SrcGridT = @c openvdb::FloatGrid, @c nanovdb::FloatGrid or @c nanovdb::tools::build::FloatGrid.
+
+    \brief Convert any grid to a nanovdb grid of the same type, e.g. float->float
+    \code
+    auto handle = nanovdb::tools::createNanoGrid(srcGrid);
+    auto *dstGrid = handle.grid<float>();
+    \endcode
+
+    \brief Convert a grid to a nanovdb grid of a different type, e.g. float->half
float->half + \code + auto handle = nanovdb::tools::createNanoGrid(srcGrid); + auto *dstGrid = handle.grid(); + \endcode + + \brief Convert a grid to a nanovdb grid of the same type but using a CUDA buffer + \code + auto handle = nanovdb::tools::createNanoGrid(srcGrid); + auto *dstGrid = handle.grid(); + \endcode + + \brief Create a nanovdb grid that indices values in an existing source grid of any type. + If DstBuildT = nanovdb::ValueIndex both active and in-active values are indexed + and if DstBuildT = nanovdb::ValueOnIndex only active values are indexed. + \code + using DstBuildT = nanovdb::ValueIndex;// index both active an inactive values + auto handle = nanovdb::tools::createNanoGridSrcGridT,DstBuildT>(srcGrid,0,false,false);//no blind data, tile values or stats + auto *dstGrid = handle.grid(); + \endcode + + \brief Create a NanoVDB grid from scratch + \code +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + using SrcGridT = openvdb::FloatGrid; +#else + using SrcGridT = nanovdb::tools::build::FloatGrid; +#endif + SrcGridT srcGrid(0.0f);// create an empty source grid + auto srcAcc = srcGrid.getAccessor();// create an accessor + srcAcc.setValue(nanovdb::Coord(1,2,3), 1.0f);// set a voxel value + + auto handle = nanovdb::tools::createNanoGrid(srcGrid);// convert source grid to a grid handle + auto dstGrid = handle.grid();// get a pointer to the destination grid + \endcode + + \brief Convert a base-pointer to an openvdb grid, denoted srcGrid, to a nanovdb + grid of the same type, e.g. float -> float or openvdb::Vec3f -> nanovdb::Vec3f + \code + auto handle = nanovdb::openToNanoVDB(*srcGrid);// convert source grid to a grid handle + auto dstGrid = handle.grid();// get a pointer to the destination grid + \endcode + + \brief Converts any existing grid to a NanoVDB grid, for example: + nanovdb::tools::build::Grid -> nanovdb::Grid + nanovdb::Grid -> nanovdb::Grid + nanovdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + + \note This files replaces GridBuilder.h, IndexGridBuilder.h and OpenToNanoVDB.h +*/ + +#ifndef NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for nanovdb::math::DitherLUT + +#include +#include +#include +#include // for memcpy +#include + +namespace nanovdb {// ============================================================================ + +namespace tools {// ============================================================================== + +// Forward declarations (defined below) +template class CreateNanoGrid; +class AbsDiff; +template struct MapToNano; + +//================================================================================================ + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) +/// @brief Forward declaration of free-standing function that converts an OpenVDB GridBase into a NanoVDB GridHandle +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param base Shared pointer to a base openvdb grid to be converted +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @return Handle to the destination 
NanoGrid +template +GridHandle +openToNanoVDB(const openvdb::GridBase::Ptr& base, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + int verbose = 0); +#endif + +//================================================================================================ + +/// @brief Freestanding function that creates a NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT Type of values in the output (destination) nanovdb Grid, e.g. float or nanovdb::Fp16 +/// @tparam BufferT Type of the buffer used ti allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename BufferT = HostBuffer> +typename util::disable_if::is_index || BuildTraits::is_Fp, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function that creates a NanoGrid or NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT If ValueIndex all (active and inactive) values are indexed and if +/// it is ValueOnIndex only active values are indexed. +/// @tparam BufferT BufferT Type of the buffer used ti allocate the destination grid +/// @param channels If non-zero the values (active or all) in @c srcGrid are encoded as blind +/// data in the output index grid. @c channels indicates the number of copies +/// of these blind data +/// @param includeStats If true all tree nodes will includes indices for stats, i.e. min/max/avg/std-div +/// @param includeTiles If false on values in leaf nodes are indexed +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid where T = ValueIndex or ValueOnIndex +template::type, + typename BufferT = HostBuffer> +typename util::enable_if::is_index, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + uint32_t channels = 0u, + bool includeStats = true, + bool includeTiles = true, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function to create a NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT = FpN, i.e. variable bit-width of the output grid +/// @tparam OracleT Type of the oracle used to determine the local bit-width, i.e. 
N in FpN +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param ditherOn switch to enable or disable dithering of quantization error +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param oracle Instance of a oracle used to determine the local bit-width, i.e. N in FpN +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename OracleT = AbsDiff, + typename BufferT = HostBuffer> +typename util::enable_if::value, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + bool ditherOn = false, + int verbose = 0, + const OracleT &oracle = OracleT(), + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function to create a NanoGrid from any source grid, X=4,8,16 +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT = Fp4, Fp8 or Fp16, i.e. quantization bit-width of the output grid +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param ditherOn switch to enable or disable dithering of quantization error +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename BufferT = HostBuffer> +typename util::enable_if::is_FpX, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + bool ditherOn = false, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Compression oracle based on absolute difference +class AbsDiff +{ + float mTolerance;// absolute error tolerance +public: + /// @note The default value of -1 means it's un-initialized! + AbsDiff(float tolerance = -1.0f) : mTolerance(tolerance) {} + AbsDiff(const AbsDiff&) = default; + ~AbsDiff() = default; + operator bool() const {return mTolerance>=0.0f;} + void init(nanovdb::GridClass gClass, float background) { + if (gClass == GridClass::LevelSet) { + static const float halfWidth = 3.0f; + mTolerance = 0.1f * background / halfWidth;// range of ls: [-3dx; 3dx] + } else if (gClass == GridClass::FogVolume) { + mTolerance = 0.01f;// range of FOG volumes: [0;1] + } else { + mTolerance = 0.0f; + } + } + void setTolerance(float tolerance) { mTolerance = tolerance; } + float getTolerance() const { return mTolerance; } + /// @brief Return true if the approximate value is within the accepted + /// absolute error bounds of the exact value. 
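+    ///
+    /// A minimal usage sketch (assuming an existing float source grid @c srcGrid of type @c SrcGridT):
+    /// \code
+    /// AbsDiff oracle(0.01f);// accept approximations within 0.01 of the exact value
+    /// auto handle = createNanoGrid<SrcGridT, FpN>(srcGrid, StatsMode::Default,
+    ///                                             CheckMode::Default, false, 0, oracle);
+    /// \endcode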
+    ///
+    /// @details Required member method
+    bool operator()(float exact, float approx) const
+    {
+        return math::Abs(exact - approx) <= mTolerance;
+    }
+};// AbsDiff
+
+inline std::ostream& operator<<(std::ostream& os, const AbsDiff& diff)
+{
+    os << "Absolute tolerance: " << diff.getTolerance();
+    return os;
+}
+
+//================================================================================================
+
+/// @brief Compression oracle based on relative difference
+class RelDiff
+{
+    float mTolerance;// relative error tolerance
+public:
+    /// @note The default value of -1 means it's un-initialized!
+    RelDiff(float tolerance = -1.0f) : mTolerance(tolerance) {}
+    RelDiff(const RelDiff&) = default;
+    ~RelDiff() = default;
+    operator bool() const {return mTolerance>=0.0f;}
+    void setTolerance(float tolerance) { mTolerance = tolerance; }
+    float getTolerance() const { return mTolerance; }
+    /// @brief Return true if the approximate value is within the accepted
+    ///        relative error bounds of the exact value.
+    ///
+    /// @details Required member method
+    bool operator()(float exact, float approx) const
+    {
+        return math::Abs(exact - approx)/math::Max(math::Abs(exact), math::Abs(approx)) <= mTolerance;
+    }
+};// RelDiff
+
+inline std::ostream& operator<<(std::ostream& os, const RelDiff& diff)
+{
+    os << "Relative tolerance: " << diff.getTolerance();
+    return os;
+}
+
+//================================================================================================
+
+/// @brief The NodeAccessor provides a uniform API for accessing nodes of NanoVDB, OpenVDB and build Grids
+///
+/// @note General implementation that works with nanovdb::tools::build::Grid
+template <typename GridT>
+class NodeAccessor
+{
+public:
+    static constexpr bool IS_OPENVDB = false;
+    static constexpr bool IS_NANOVDB = false;
+    using BuildType = typename GridT::BuildType;
+    using ValueType = typename GridT::ValueType;
+    using GridType  = GridT;
+    using TreeType  = typename GridT::TreeType;
+    using RootType  = typename TreeType::RootNodeType;
+    template <int LEVEL>
+    using NodeType = typename NodeTrait<const TreeType, LEVEL>::type;
+    NodeAccessor(const GridT &grid) : mMgr(const_cast<GridT&>(grid)) {}
+    const GridType& grid() const {return mMgr.grid();}
+    const TreeType& tree() const {return mMgr.tree();}
+    const RootType& root() const {return mMgr.root();}
+    uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); }
+    template <int LEVEL>
+    const NodeType<LEVEL>& node(uint32_t i) const {return mMgr.template node<LEVEL>(i); }
+    const std::string& getName() const {return this->grid().getName();};
+    bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;}
+    const nanovdb::Map& map() const {return this->grid().map();}
+    GridClass gridClass() const {return this->grid().gridClass();}
+private:
+    build::NodeManager<GridT> mMgr;
+};// NodeAccessor
+
+//================================================================================================
+
+/// @brief Template specialization for nanovdb::Grid which is special since its NodeManager
+///        uses a handle in order to support node access on the GPU!
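+///
+/// A sketch of the uniform traversal this API enables (handle name assumed):
+/// \code
+/// NodeAccessor<NanoGrid<float>> acc(*handle.grid<float>());
+/// for (uint64_t i = 0, n = acc.nodeCount(0); i < n; ++i) {
+///     const auto &leaf = acc.node<0>(i);// LEVEL 0 corresponds to leaf nodes
+/// }
+/// \endcode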
+template +class NodeAccessor< NanoGrid > +{ +public: + static constexpr bool IS_OPENVDB = false; + static constexpr bool IS_NANOVDB = true; + using BuildType = BuildT; + using BufferType = HostBuffer; + using GridType = NanoGrid; + using ValueType = typename GridType::ValueType; + using TreeType = typename GridType::TreeType; + using RootType = typename TreeType::RootType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) + : mHandle(createNodeManager(grid)) + , mMgr(*(mHandle.template mgr())) {} + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const {return std::string(this->grid().gridName());}; + bool hasLongGridName() const {return this->grid().hasLongGridName();} + const nanovdb::Map& map() const {return this->grid().map();} + GridClass gridClass() const {return this->grid().gridClass();} +private: + NodeManagerHandle mHandle; + const NodeManager &mMgr; +};// NodeAccessor + +//================================================================================================ + +/// @brief Trait that maps any type to the corresponding nanovdb type +/// @tparam T Type to be mapped +template +struct MapToNano { using type = T; }; + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + +template<> +struct MapToNano {using type = nanovdb::ValueMask;}; +template +struct MapToNano>{using type = nanovdb::math::Vec3;}; +template +struct MapToNano>{using type = nanovdb::math::Vec4;}; +template<> +struct MapToNano {using type = uint32_t;}; +template<> +struct MapToNano {using type = uint32_t;}; + +/// Templated Grid with default 32->16->8 configuration +template +using OpenLeaf = openvdb::tree::LeafNode; +template +using OpenLower = openvdb::tree::InternalNode,4>; +template +using OpenUpper = openvdb::tree::InternalNode,5>; +template +using OpenRoot = openvdb::tree::RootNode>; +template +using OpenTree = openvdb::tree::Tree>; +template +using OpenGrid = openvdb::Grid>; + +//================================================================================================ + +/// @brief Template specialization for openvdb::Grid +template +class NodeAccessor> +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = BuildT; + using GridType = OpenGrid; + using ValueType = typename GridType::ValueType; + using TreeType = OpenTree; + using RootType = OpenRoot; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const { + switch (this->grid().getGridClass()) { + case 
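+            // level sets only make sense for scalar floating-point grids,
+            // hence the check on the next line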
openvdb::GRID_LEVEL_SET: + if (!util::is_floating_point::value) OPENVDB_THROW(openvdb::ValueError, "processGrid: Level sets are expected to be floating point types"); + return GridClass::LevelSet; + case openvdb::GRID_FOG_VOLUME: + return GridClass::FogVolume; + case openvdb::GRID_STAGGERED: + return GridClass::Staggered; + default: + return GridClass::Unknown; + } + } +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor> + +//================================================================================================ + +/// @brief Template specialization for openvdb::tools::PointIndexGrid +template <> +class NodeAccessor +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = openvdb::PointIndex32; + using GridType = openvdb::tools::PointIndexGrid; + using TreeType = openvdb::tools::PointIndexTree; + using RootType = typename TreeType::RootNodeType; + using ValueType = typename GridType::ValueType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const {return GridClass::PointIndex;} +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor + +//================================================================================================ + +// @brief Template specialization for openvdb::points::PointDataGrid +template <> +class NodeAccessor +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = openvdb::PointDataIndex32; + using GridType = openvdb::points::PointDataGrid; + using TreeType = openvdb::points::PointDataTree; + using RootType = typename TreeType::RootNodeType; + using ValueType = typename GridType::ValueType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const {return GridClass::PointData;} +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor + +#endif + +//================================================================================================ + +/// @brief Creates any nanovdb Grid from any source grid 
(certain combinations are obviously not allowed) +template +class CreateNanoGrid +{ +public: + // SrcGridT can be either openvdb::Grid, nanovdb::Grid or nanovdb::tools::build::Grid + using SrcNodeAccT = NodeAccessor; + using SrcBuildT = typename SrcNodeAccT::BuildType; + using SrcValueT = typename SrcNodeAccT::ValueType; + using SrcTreeT = typename SrcNodeAccT::TreeType; + using SrcRootT = typename SrcNodeAccT::RootType; + template + using SrcNodeT = typename NodeTrait::type; + + /// @brief Constructor from a source grid + /// @param srcGrid Source grid of type SrcGridT + CreateNanoGrid(const SrcGridT &srcGrid); + + /// @brief Constructor from a source node accessor (defined above) + /// @param srcNodeAcc Source node accessor of type SrcNodeAccT + CreateNanoGrid(const SrcNodeAccT &srcNodeAcc); + + /// @brief Set the level of verbosity + /// @param mode level of verbosity, mode=0 means quiet + void setVerbose(int mode = 1) { mVerbose = mode; } + + /// @brief Enable or disable dithering, i.e. randomization of the quantization error. + /// @param on enable or disable dithering + /// @warning Dithering only has an affect when DstBuildT = {Fp4, Fp8, Fp16, FpN} + void enableDithering(bool on = true) { mDitherOn = on; } + + /// @brief Set the mode used for computing statistics of the destination grid + /// @param mode specify the mode of statistics + void setStats(StatsMode mode = StatsMode::Default) { mStats = mode; } + + /// @brief Set the mode used for computing checksums of the destination grid + /// @param mode specify the mode of checksum + void setChecksum(CheckMode mode = CheckMode::Default) { mChecksum = mode; } + + /// @brief Converts the source grid into a nanovdb grid with the specified destination build type + /// @tparam DstBuildT build type of the destination, output, grid + /// @tparam BufferT Type of the buffer used for allocating the destination grid + /// @param buffer instance of the buffer use for allocation + /// @return Return an instance of a GridHandle (invoking move semantics) + /// @note This version is when DstBuildT != {FpN, ValueIndex, ValueOnIndex} + template::type, typename BufferT = HostBuffer> + typename util::disable_if::value || + BuildTraits::is_index, GridHandle>::type + getHandle(const BufferT &buffer = BufferT()); + + /// @brief Converts the source grid into a nanovdb grid with variable bit quantization + /// @tparam DstBuildT FpN, i.e. the destination grid uses variable bit quantization + /// @tparam OracleT Type of oracle used to determine the N in FpN + /// @tparam BufferT Type of the buffer used for allocating the destination grid + /// @param oracle Instance of the oracle used to determine the N in FpN + /// @param buffer instance of the buffer use for allocation + /// @return Return an instance of a GridHandle (invoking move semantics) + /// @note This version assumes DstBuildT == FpN + template::type, typename OracleT = AbsDiff, typename BufferT = HostBuffer> + typename util::enable_if::value, GridHandle>::type + getHandle(const OracleT &oracle = OracleT(), + const BufferT &buffer = BufferT()); + + /// @brief Converts the source grid into a nanovdb grid with indices to external arrays of values + /// @tparam DstBuildT ValueIndex or ValueOnIndex, i.e. 
index all or just active values
+    /// @tparam BufferT Type of the buffer used for allocating the destination grid
+    /// @param channels Number of copies of values encoded as blind data in the destination grid
+    /// @param includeStats Specify if stats should be indexed
+    /// @param includeTiles Specify if tile values, i.e. non-leaf-node-values, should be indexed
+    /// @param buffer instance of the buffer used for allocation
+    /// @return Return an instance of a GridHandle (invoking move semantics)
+    template<typename DstBuildT = typename MapToNano<SrcBuildT>::type, typename BufferT = HostBuffer>
+    typename util::enable_if<BuildTraits<DstBuildT>::is_index, GridHandle<BufferT>>::type
+    getHandle(uint32_t channels = 0u,
+              bool includeStats = true,
+              bool includeTiles = true,
+              const BufferT &buffer = BufferT());
+
+    /// @brief Add blind data to the destination grid
+    /// @param name String name of the blind data
+    /// @param dataSemantic Semantics of the blind data
+    /// @param dataClass Class of the blind data
+    /// @param dataType Type of the blind data
+    /// @param count Element count of the blind data
+    /// @param size Size of each element of the blind data
+    /// @return Return the index used to access the blind data
+    uint64_t addBlindData(const std::string& name,
+                          GridBlindDataSemantic dataSemantic,
+                          GridBlindDataClass dataClass,
+                          GridType dataType,
+                          size_t count, size_t size)
+    {
+        const size_t order = mBlindMetaData.size();
+        mBlindMetaData.emplace(name, dataSemantic, dataClass, dataType, order, count, size);
+        return order;
+    }
+
+    /// @brief This method only has an effect when getHandle was called with DstBuildT = ValueIndex or ValueOnIndex
+    /// @return Return the number of indexed values. If called before getHandle was called with
+    ///         DstBuildT = ValueIndex or ValueOnIndex the return value is zero. Otherwise it is a value larger than zero.
+    uint64_t valueCount() const {return mValIdx[0].empty() ? 0u : mValIdx[0].back();}
+
+    /// @brief Copy values from the source grid into a provided buffer
+    /// @tparam DstBuildT Must be ValueIndex or ValueOnIndex, i.e.
a index grid + /// @param buffer point in which to write values + template + typename util::enable_if::is_index>::type + copyValues(SrcValueT *buffer); + +private: + + // ========================================================= + + template + typename util::enable_if::value&&LEVEL==0), typename NodeTrait, LEVEL>::type*>::type + dstNode(uint64_t i) const { + static_assert(LEVEL==0 || LEVEL==1 || LEVEL==2, "Expected LEVEL== {0,1,2}"); + using NodeT = typename NodeTrait, LEVEL>::type; + return util::PtrAdd(mBufferPtr, mOffset[5-LEVEL]) + i; + } + template + typename util::enable_if::value && LEVEL==0, NanoLeaf*>::type + dstNode(uint64_t i) const {return util::PtrAdd>(mBufferPtr, mCodec[i].offset);} + + template NanoRoot* dstRoot() const {return util::PtrAdd>(mBufferPtr, mOffset.root);} + template NanoTree* dstTree() const {return util::PtrAdd>(mBufferPtr, mOffset.tree);} + template NanoGrid* dstGrid() const {return util::PtrAdd>(mBufferPtr, mOffset.grid);} + GridBlindMetaData* dstMeta(uint32_t i) const { return util::PtrAdd(mBufferPtr, mOffset.meta) + i;}; + + // ========================================================= + + template + typename util::disable_if::value || BuildTraits::is_index>::type + preProcess(); + + template + typename util::enable_if::is_index>::type + preProcess(uint32_t channels); + + template + typename util::enable_if::value>::type + preProcess(OracleT oracle); + + // ========================================================= + + // Below are private methods use to serialize nodes into NanoVDB + template + GridHandle initHandle(const BufferT& buffer); + + // ========================================================= + + template + inline typename util::enable_if::is_index>::type + postProcess(uint32_t channels); + + template + inline typename util::disable_if::is_index>::type + postProcess(); + + // ======================================================== + + template + typename util::disable_if::is_special>::type + processLeafs(); + + template + typename util::enable_if::is_index>::type + processLeafs(); + + template + typename util::enable_if::is_FpX>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + // ========================================================= + + template + typename util::enable_if::is_index>::type + processInternalNodes(); + + template + typename util::enable_if::is_index>::type + processInternalNodes(); + + // ========================================================= + + template + typename util::enable_if::is_index>::type + processRoot(); + + template + typename util::enable_if::is_index>::type + processRoot(); + + // ========================================================= + + template + void processTree(); + + template + void processGrid(); + + template + typename util::enable_if::is_index, uint64_t>::type + countTileValues(uint64_t valueCount); + + template + typename util::enable_if::is_index, uint64_t>::type + countValues(); + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + template + typename util::disable_if::value || + util::is_same::value, uint64_t>::type + countPoints() const; + + template + typename util::enable_if::value || + util::is_same::value, uint64_t>::type + countPoints() const; + + template + typename util::enable_if::value>::type + copyPointAttribute(size_t attIdx, AttT *attPtr); +#else + uint64_t countPoints() const 
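+    // without OpenVDB support there are no point grids to count,
+    // so this fallback simply reports zero points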
{return 0u;} +#endif + + void* mBufferPtr;// pointer to the beginning of the destination nanovdb grid buffer + struct BufferOffsets { + uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size; + uint64_t operator[](int i) const { return *(reinterpret_cast(this)+i); } + } mOffset; + int mVerbose; + uint64_t mLeafNodeSize;// non-trivial when DstBuiltT = FpN + + std::unique_ptr mSrcNodeAccPtr;// placeholder for potential local instance + const SrcNodeAccT &mSrcNodeAcc; + struct BlindMetaData; // forward declaration + std::set mBlindMetaData; // sorted according to BlindMetaData.order + struct Codec { float min, max; uint64_t offset; uint8_t log2; };// used for adaptive bit-rate quantization + std::unique_ptr mCodec;// defines a codec per leaf node when DstBuildT = FpN + StatsMode mStats; + CheckMode mChecksum; + bool mDitherOn, mIncludeStats, mIncludeTiles; + std::vector mValIdx[3];// store id of first value in node +}; // CreateNanoGrid + +//================================================================================================ + +template +CreateNanoGrid::CreateNanoGrid(const SrcGridT &srcGrid) + : mVerbose(0) + , mSrcNodeAccPtr(new SrcNodeAccT(srcGrid)) + , mSrcNodeAcc(*mSrcNodeAccPtr) + , mStats(StatsMode::Default) + , mChecksum(CheckMode::Default) + , mDitherOn(false) + , mIncludeStats(true) + , mIncludeTiles(true) +{ +} + +//================================================================================================ + +template +CreateNanoGrid::CreateNanoGrid(const SrcNodeAccT &srcNodeAcc) + : mVerbose(0) + , mSrcNodeAccPtr(nullptr) + , mSrcNodeAcc(srcNodeAcc) + , mStats(StatsMode::Default) + , mChecksum(CheckMode::Default) + , mDitherOn(false) + , mIncludeStats(true) + , mIncludeTiles(true) +{ +} + +//================================================================================================ + +template +struct CreateNanoGrid::BlindMetaData +{ + BlindMetaData(const std::string& name,// name + used to derive GridBlindDataSemantic + const std::string& type,// used to derive GridType of blind data + GridBlindDataClass dataClass, + size_t i, size_t valueCount, size_t valueSize) + : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) + , order(i)// sorted id of meta data + , size(math::AlignUp(valueCount * valueSize)) + { + util::memzero(metaData, sizeof(GridBlindMetaData));// zero out all meta data + if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds limit"); + std::memcpy(metaData->mName, name.c_str(), name.length() + 1); + metaData->mValueCount = valueCount; + metaData->mSemantic = BlindMetaData::mapToSemantics(name); + metaData->mDataClass = dataClass; + metaData->mDataType = BlindMetaData::mapToType(type); + metaData->mValueSize = valueSize; + NANOVDB_ASSERT(metaData->isValid()); + } + BlindMetaData(const std::string& name,// only name + GridBlindDataSemantic dataSemantic, + GridBlindDataClass dataClass, + GridType dataType, + size_t i, size_t valueCount, size_t valueSize) + : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) + , order(i)// sorted id of meta data + , size(math::AlignUp(valueCount * valueSize)) + { + std::memset(metaData, 0, sizeof(GridBlindMetaData));// zero out all meta data + if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds character limit"); + std::memcpy(metaData->mName, name.c_str(), name.length() + 1); + metaData->mValueCount = valueCount; + metaData->mSemantic = dataSemantic; + metaData->mDataClass = dataClass; + 
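+        // unlike the constructor above, the semantic and type are passed in
+        // explicitly here instead of being derived from the attribute name/type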
metaData->mDataType = dataType; + metaData->mValueSize = valueSize; + NANOVDB_ASSERT(metaData->isValid()); + } + ~BlindMetaData(){ delete [] reinterpret_cast(metaData); } + bool operator<(const BlindMetaData& other) const { return order < other.order; } // required by std::set + static GridType mapToType(const std::string& name) + { + GridType type = GridType::Unknown; + if ("uint32_t" == name) { + type = GridType::UInt32; + } else if ("float" == name) { + type = GridType::Float; + } else if ("vec3s"== name) { + type = GridType::Vec3f; + } else if ("int32" == name) { + type = GridType::Int32; + } else if ("int64" == name) { + type = GridType::Int64; + } + return type; + } + static GridBlindDataSemantic mapToSemantics(const std::string& name) + { + GridBlindDataSemantic semantic = GridBlindDataSemantic::Unknown; + if ("P" == name) { + semantic = GridBlindDataSemantic::PointPosition; + } else if ("V" == name) { + semantic = GridBlindDataSemantic::PointVelocity; + } else if ("Cd" == name) { + semantic = GridBlindDataSemantic::PointColor; + } else if ("N" == name) { + semantic = GridBlindDataSemantic::PointNormal; + } else if ("id" == name) { + semantic = GridBlindDataSemantic::PointId; + } + return semantic; + } + GridBlindMetaData *metaData; + const size_t order, size; +}; // CreateNanoGrid::BlindMetaData + +//================================================================================================ + +template +template +typename util::disable_if::value || + BuildTraits::is_index, GridHandle>::type +CreateNanoGrid::getHandle(const BufferT& pool) +{ + this->template preProcess(); + auto handle = this->template initHandle(pool); + this->template postProcess(); + return handle; +} // CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +typename util::enable_if::value, GridHandle>::type +CreateNanoGrid::getHandle(const OracleT& oracle, const BufferT& pool) +{ + this->template preProcess(oracle); + auto handle = this->template initHandle(pool); + this->template postProcess(); + return handle; +} // CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +typename util::enable_if::is_index, GridHandle>::type +CreateNanoGrid::getHandle(uint32_t channels, + bool includeStats, + bool includeTiles, + const BufferT &pool) +{ + mIncludeStats = includeStats; + mIncludeTiles = includeTiles; + this->template preProcess(channels); + auto handle = this->template initHandle(pool); + this->template postProcess(channels); + return handle; +}// CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +GridHandle CreateNanoGrid::initHandle(const BufferT& pool) +{ + mOffset.grid = 0;// grid is always stored at the start of the buffer! 
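+    // The handle's buffer is one contiguous, breadth-first block:
+    //
+    //   Grid | Tree | Root | Upper | Lower | Leafs | MetaData | BlindData
+    //
+    // so each offset computed below is simply where the previous section ends.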
+ mOffset.tree = NanoGrid::memUsage(); // grid ends and tree begins + mOffset.root = mOffset.tree + NanoTree::memUsage(); // tree ends and root node begins + mOffset.upper = mOffset.root + NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()); // root node ends and upper internal nodes begin + mOffset.lower = mOffset.upper + NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2); // upper internal nodes ends and lower internal nodes begin + mOffset.leaf = mOffset.lower + NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); // lower internal nodes ends and leaf nodes begin + mOffset.meta = mOffset.leaf + mLeafNodeSize;// leaf nodes end and blind meta data begins + mOffset.blind = mOffset.meta + sizeof(GridBlindMetaData)*mBlindMetaData.size(); // meta data ends and blind data begins + mOffset.size = mOffset.blind;// end of buffer + for (const auto& b : mBlindMetaData) mOffset.size += b.size; // accumulate all the blind data + + auto buffer = BufferT::create(mOffset.size, &pool); + mBufferPtr = buffer.data(); + + // Concurrent processing of all tree levels! + util::invoke( [&](){this->template processLeafs();}, + [&](){this->template processInternalNodes();}, + [&](){this->template processInternalNodes();}, + [&](){this->template processRoot();}, + [&](){this->template processTree();}, + [&](){this->template processGrid();} ); + + return GridHandle(std::move(buffer)); +} // CreateNanoGrid::initHandle + +//================================================================================================ + +template +template +inline typename util::disable_if::value || BuildTraits::is_index>::type +CreateNanoGrid::preProcess() +{ + if (const uint64_t pointCount = this->countPoints()) { +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + if constexpr(util::is_same::value) { + if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); + this->addBlindData("index", + GridBlindDataSemantic::PointId, + GridBlindDataClass::IndexArray, + GridType::UInt32, + pointCount, + sizeof(uint32_t)); + } else if constexpr(util::is_same::value) { + if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); + auto &srcLeaf = mSrcNodeAcc.template node<0>(0); + const auto& attributeSet = srcLeaf.attributeSet(); + const auto& descriptor = attributeSet.descriptor(); + const auto& nameMap = descriptor.map(); + for (auto it = nameMap.begin(); it != nameMap.end(); ++it) { + const size_t index = it->second; + auto& attArray = srcLeaf.constAttributeArray(index); + mBlindMetaData.emplace(it->first, // name used to derive semantics + descriptor.valueType(index), // type + it->first == "id" ? 
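+                                    // the "id" attribute doubles as an index
+                                    // array; all other point attributes are
+                                    // stored as attribute arrays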
GridBlindDataClass::IndexArray : GridBlindDataClass::AttributeArray, // class + index, // order + pointCount, // element count + attArray.valueTypeSize()); // element size + } + } +#endif + } + if (mSrcNodeAcc.hasLongGridName()) { + this->addBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } + mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); +}// CreateNanoGrid::preProcess + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::preProcess(OracleT oracle) +{ + static_assert(util::is_same::value, "preProcess: expected SrcValueT == float"); + + const size_t leafCount = mSrcNodeAcc.nodeCount(0); + if (leafCount==0) { + mLeafNodeSize = 0u; + return; + } + mCodec.reset(new Codec[leafCount]); + + if constexpr(util::is_same::value) { + if (!oracle) oracle.init(mSrcNodeAcc.gridClass(), mSrcNodeAcc.root().background()); + } + + math::DitherLUT lut(mDitherOn); + util::forEach(0, leafCount, 4, [&](const util::Range1D &r) { + for (auto i=r.begin(); i!=r.end(); ++i) { + const auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + float &min = mCodec[i].min = std::numeric_limits::max(); + float &max = mCodec[i].max = -min; + for (int j=0; j<512; ++j) { + float v = srcLeaf.getValue(j); + if (vmax) max = v; + } + const float range = max - min; + uint8_t &logBitWidth = mCodec[i].log2 = 0;// 0,1,2,3,4 => 1,2,4,8,16 bits + while (range > 0.0f && logBitWidth < 4u) { + const uint32_t mask = (uint32_t(1) << (uint32_t(1) << logBitWidth)) - 1u; + const float encode = mask/range; + const float decode = range/mask; + int j = 0; + do { + const float exact = srcLeaf.getValue(j);//data[j];// exact value + const uint32_t code = uint32_t(encode*(exact - min) + lut(j)); + const float approx = code * decode + min;// approximate value + j += oracle(exact, approx) ? 1 : 513; + } while(j < 512); + if (j == 512) break; + ++logBitWidth; + } + } + }); + + auto getOffset = [&](size_t i){ + --i; + return mCodec[i].offset + NanoLeaf::DataType::memUsage(1u << mCodec[i].log2); + }; + mCodec[0].offset = NanoGrid::memUsage() + + NanoTree::memUsage() + + NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()) + + NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2) + + NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); + for (size_t i=1; iaddBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } +}// CreateNanoGrid::preProcess + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index, uint64_t>::type +CreateNanoGrid::countTileValues(uint64_t valueCount) +{ + const uint64_t stats = mIncludeStats ? 
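+    // Count the values to be indexed per node, then convert the counts into
+    // running offsets below, so that entry i ends up holding the index of
+    // the first value in node i.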
4u : 0u;// minimum, maximum, average, and deviation + mValIdx[LEVEL].clear(); + mValIdx[LEVEL].resize(mSrcNodeAcc.nodeCount(LEVEL) + 1, stats);// minimum 1 entry + util::forEach(1, mValIdx[LEVEL].size(), 8, [&](const util::Range1D& r){ + for (auto i = r.begin(); i!=r.end(); ++i) { + auto &srcNode = mSrcNodeAcc.template node(i-1); + if constexpr(BuildTraits::is_onindex) {// resolved at compile time + mValIdx[LEVEL][i] += srcNode.getValueMask().countOn(); + } else { + static const uint64_t maxTileCount = uint64_t(1u) << 3*srcNode.LOG2DIM; + mValIdx[LEVEL][i] += maxTileCount - srcNode.getChildMask().countOn(); + } + } + }); + mValIdx[LEVEL][0] = valueCount; + for (size_t i=1; i + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index, uint64_t>::type +CreateNanoGrid::countValues() +{ + const uint64_t stats = mIncludeStats ? 4u : 0u;// minimum, maximum, average, and deviation + uint64_t valueCount = 1u;// offset 0 corresponds to the background value + if (mIncludeTiles) { + if constexpr(BuildTraits::is_onindex) { + for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) ++valueCount; + } else { + for (auto it = mSrcNodeAcc.root().cbeginValueAll(); it; ++it) ++valueCount; + } + valueCount += stats;// optionally append stats for the root node + valueCount = countTileValues(valueCount); + valueCount = countTileValues(valueCount); + } + mValIdx[0].clear(); + mValIdx[0].resize(mSrcNodeAcc.nodeCount(0) + 1, 512u + stats);// minimum 1 entry + if constexpr(BuildTraits::is_onindex) { + util::forEach(1, mValIdx[0].size(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + mValIdx[0][i] = stats; + mValIdx[0][i] += mSrcNodeAcc.template node<0>(i-1).getValueMask().countOn(); + } + }); + } + mValIdx[0][0] = valueCount; + util::prefixSum(mValIdx[0], true);// inclusive prefix sum + return mValIdx[0].back(); +}// CreateNanoGrid::countValues() + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::preProcess(uint32_t channels) +{ + const uint64_t valueCount = this->template countValues(); + mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); + + uint32_t order = mBlindMetaData.size(); + char str[16]; + for (uint32_t i=0; i()), + GridBlindDataClass::AttributeArray, + order++, + valueCount, + sizeof(SrcValueT)); + } + if (mSrcNodeAcc.hasLongGridName()) { + this->addBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } +}// preProcess + +//================================================================================================ + +template +template +inline typename util::disable_if::is_special>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + using DstValueT = typename DstDataT::ValueType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = 
dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + dstLeaf->mMinimum = dstLeaf->mMaximum = typename DstDataT::ValueType(); + dstLeaf->mAverage = dstLeaf->mStdDevi = 0; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + DstValueT *dst = dstLeaf->mValues; + if constexpr(util::is_same::value && SrcNodeAccT::IS_OPENVDB) { + const SrcValueT *src = srcLeaf.buffer().data(); + for (auto *end = dst + 512u; dst != end; dst += 4, src += 4) { + dst[0] = src[0]; // copy *all* voxel values in sets of four, i.e. loop-unrolling + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + } else { + for (uint32_t j=0; j<512u; ++j) *dst++ = static_cast(srcLeaf.getValue(j)); + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + static_assert(DstDataT::padding()==0u, "Expected leaf nodes to have no padding"); + + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + const uint8_t flags = mIncludeStats ? 16u : 0u;// 4th bit indicates stats + DstDataT *dstLeaf = this->template dstNode(r.begin());// fixed size + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = flags; + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + dstLeaf->mOffset = mValIdx[0][i]; + if constexpr(BuildTraits::is_onindex) { + const uint64_t *w = dstLeaf->mValueMask.words(); +#ifdef USE_OLD_VALUE_ON_INDEX + int32_t sum = CountOn(*w++); + uint8_t *p = reinterpret_cast(&dstLeaf->mPrefixSum), *q = p + 7; + for (int j=0; j<7; ++j) { + *p++ = sum & 255u; + *q |= (sum >> 8) << j; + sum += CountOn(*w++); + } +#else + uint64_t &prefixSum = dstLeaf->mPrefixSum, sum = util::countOn(*w++); + prefixSum = sum; + for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 + sum += util::countOn(*w++); + prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits + } +#endif + } else { + dstLeaf->mPrefixSum = 0u; + } + if constexpr(BuildTraits::is_indexmask) dstLeaf->mMask = dstLeaf->mValueMask; + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + dstLeaf->mPadding[0] = dstLeaf->mPadding[1] = 0u; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy 
origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + if constexpr(!util::is_same::value) { + for (int j=0; j<512; ++j) dstLeaf->mValues.set(j, static_cast(srcLeaf.getValue(j))); + } else if constexpr(SrcNodeAccT::IS_OPENVDB) { + dstLeaf->mValues = *reinterpret_cast*>(srcLeaf.buffer().data()); + } else if constexpr(SrcNodeAccT::IS_NANOVDB) { + dstLeaf->mValues = srcLeaf.data()->mValues; + } else {// tools::Leaf + dstLeaf->mValues = srcLeaf.mValues; // copy value mask + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_FpX>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + using ArrayT = typename DstDataT::ArrayType; + static_assert(util::is_same::value, "Expected ValueT == float"); + using FloatT = typename std::conditional=16, double, float>::type;// 16 compression and higher requires double + static constexpr FloatT UNITS = FloatT((1 << DstDataT::bitWidth()) - 1);// # of unique non-zero values + math::DitherLUT lut(mDitherOn); + + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mFlags = dstLeaf->mBBoxDif[2] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[0] = 0u; + dstLeaf->mDev = dstLeaf->mAvg = dstLeaf->mMax = dstLeaf->mMin = 0u; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + // compute extrema values + float min = std::numeric_limits::max(), max = -min; + for (uint32_t j=0; j<512u; ++j) { + const float v = srcLeaf.getValue(j); + if (v < min) min = v; + if (v > max) max = v; + } + dstLeaf->init(min, max, DstDataT::bitWidth()); + // perform quantization relative to the values in the current leaf node + const FloatT encode = UNITS/(max-min); + uint32_t offset = 0; + auto quantize = [&]()->ArrayT{ + const ArrayT tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); + ++offset; + return tmp; + }; + auto *code = reinterpret_cast(dstLeaf->mCode); + if (util::is_same::value) {// resolved at compile-time + for 
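+            // Fp4 packs two 4-bit codes per byte, low nibble first,
+            // i.e. 512 values -> 256 bytes per leaf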
(uint32_t j=0; j<128u; ++j) { + auto tmp = quantize(); + *code++ = quantize() << 4 | tmp; + tmp = quantize(); + *code++ = quantize() << 4 | tmp; + } + } else { + for (uint32_t j=0; j<128u; ++j) { + *code++ = quantize(); + *code++ = quantize(); + *code++ = quantize(); + *code++ = quantize(); + } + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + static_assert(util::is_same::value, "Expected SrcValueT == float"); + math::DitherLUT lut(mDitherOn); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + auto *dstLeaf = this->template dstNode(i); + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + const uint8_t logBitWidth = mCodec[i].log2; + dstLeaf->mFlags = logBitWidth << 5;// pack logBitWidth into 3 MSB of mFlag + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + const float min = mCodec[i].min, max = mCodec[i].max; + dstLeaf->init(min, max, uint8_t(1) << logBitWidth); + // perform quantization relative to the values in the current leaf node + uint32_t offset = 0; + float encode = 0.0f; + auto quantize = [&]()->uint8_t{ + const uint8_t tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); + ++offset; + return tmp; + }; + auto *dst = reinterpret_cast(dstLeaf+1); + switch (logBitWidth) { + case 0u: {// 1 bit + encode = 1.0f/(max - min); + for (int j=0; j<64; ++j) { + uint8_t a = 0; + for (int k=0; k<8; ++k) a |= quantize() << k; + *dst++ = a; + } + } + break; + case 1u: {// 2 bits + encode = 3.0f/(max - min); + for (int j=0; j<128; ++j) { + auto a = quantize(); + a |= quantize() << 2; + a |= quantize() << 4; + *dst++ = quantize() << 6 | a; + } + } + break; + case 2u: {// 4 bits + encode = 15.0f/(max - min); + for (int j=0; j<128; ++j) { + auto a = quantize(); + *dst++ = quantize() << 4 | a; + a = quantize(); + *dst++ = quantize() << 4 | a; + } + } + break; + case 3u: {// 8 bits + encode = 255.0f/(max - min); + for (int j=0; j<128; ++j) { + *dst++ = quantize(); + *dst++ = quantize(); + *dst++ = quantize(); + *dst++ = quantize(); + } + } + break; + default: {// 16 bits - special implementation using higher bit-precision + auto *dst = reinterpret_cast(dstLeaf+1); + const double encode = 65535.0/(max - min);// note that double is required! 
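+                // with 65535 quantization steps a single-precision encode
+                // factor can round codes across bin boundaries, hence the
+                // double-precision encode above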
+ for (int j=0; j<128; ++j) { + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + } + } + }// end switch + } + });// kernel +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processInternalNodes() +{ + using DstNodeT = typename NanoNode::type; + using DstValueT = typename DstNodeT::ValueType; + using DstChildT = typename NanoNode::type; + static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); + + const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); + if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes + uint64_t childCount = 0; + auto *dstNode = this->template dstNode(0); + for (uint64_t i=0; i(static_cast(i)).getChildMask().countOn(); + } + } + + util::forEach(0, nodeCount, 4, [&](const util::Range1D& r) { + auto *dstNode = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstNode) { + auto &srcNode = mSrcNodeAcc.template node(i); + uint64_t childID = dstNode->mFlags; + if (DstNodeT::DataType::padding()>0u) { + util::memzero(dstNode, DstNodeT::memUsage()); + } else { + dstNode->mFlags = 0;// enable rendering, no bbox, no stats + dstNode->mMinimum = dstNode->mMaximum = typename DstNodeT::ValueType(); + dstNode->mAverage = dstNode->mStdDevi = 0; + } + dstNode->mBBox[0] = srcNode.origin(); // copy origin of node + dstNode->mValueMask = srcNode.getValueMask(); // copy value mask + dstNode->mChildMask = srcNode.getChildMask(); // copy child mask + for (auto it = srcNode.cbeginChildAll(); it; ++it) { + SrcValueT value{}; // default initialization + if (it.probeChild(value)) { + DstChildT *dstChild = this->template dstNode(childID++);// might be Leaf + dstNode->setChild(it.pos(), dstChild); + } else { + dstNode->setValue(it.pos(), static_cast(value)); + } + } + } + }); +} // CreateNanoGrid::processInternalNodes + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processInternalNodes() +{ + using DstNodeT = typename NanoNode::type; + using DstChildT = typename NanoNode::type; + static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); + static_assert(DstNodeT::DataType::padding()==0u, "Expected internal nodes to have no padding"); + + const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); + if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes + uint64_t childCount = 0; + auto *dstNode = this->template dstNode(0); + for (uint64_t i=0; i(i).getChildMask().countOn(); + } + } + + util::forEach(0, nodeCount, 4, [&](const util::Range1D& r) { + auto *dstNode = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstNode) { + auto &srcNode = mSrcNodeAcc.template node(i); + uint64_t childID = dstNode->mFlags; + dstNode->mFlags = 0u; + dstNode->mBBox[0] = srcNode.origin(); // copy origin of node + dstNode->mValueMask = srcNode.getValueMask(); // copy value mask + dstNode->mChildMask = srcNode.getChildMask(); // copy child mask + uint64_t n = mIncludeTiles ? 
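+            // n walks this node's reserved range of value indices: each tile
+            // consumes one index, followed by the four optional stats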
+            uint64_t n = mIncludeTiles ? mValIdx[LEVEL][i] : 0u;
+            for (auto it = srcNode.cbeginChildAll(); it; ++it) {
+                SrcValueT value;
+                if (it.probeChild(value)) {
+                    DstChildT *dstChild = this->template dstNode<DstBuildT, LEVEL-1>(childID++);// might be Leaf
+                    dstNode->setChild(it.pos(), dstChild);
+                } else {
+                    uint64_t m = 0u;
+                    if (mIncludeTiles && !((BuildTraits<DstBuildT>::is_onindex) && dstNode->mValueMask.isOff(it.pos()))) m = n++;
+                    dstNode->setValue(it.pos(), m);
+                }
+            }
+            if (mIncludeTiles && mIncludeStats) {// stats are always placed after the tile values
+                dstNode->mMinimum = n++;
+                dstNode->mMaximum = n++;
+                dstNode->mAverage = n++;
+                dstNode->mStdDevi = n++;
+            } else {// if not tiles or stats set stats to the background offset
+                dstNode->mMinimum = 0u;
+                dstNode->mMaximum = 0u;
+                dstNode->mAverage = 0u;
+                dstNode->mStdDevi = 0u;
+            }
+        }
+    });
+} // CreateNanoGrid::processInternalNodes
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+inline typename util::disable_if<BuildTraits<DstBuildT>::is_index>::type
+CreateNanoGrid<SrcGridT>::processRoot()
+{
+    using DstRootT = NanoRoot<DstBuildT>;
+    using DstValueT = typename DstRootT::ValueType;
+    auto &srcRoot = mSrcNodeAcc.root();
+    auto *dstRoot = this->template dstRoot<DstBuildT>();
+    const uint32_t tableSize = srcRoot.getTableSize();
+    if (DstRootT::DataType::padding()>0) util::memzero(dstRoot, DstRootT::memUsage(tableSize));
+    dstRoot->mTableSize = tableSize;
+    dstRoot->mMinimum = dstRoot->mMaximum = dstRoot->mBackground = srcRoot.background();
+    dstRoot->mBBox = CoordBBox(); // set to an empty bounding box
+    if (tableSize==0) return;
+    auto *dstChild = this->template dstNode<DstBuildT, 2>(0);// fixed size and linear in memory
+    auto *dstTile = dstRoot->tile(0);// fixed size and linear in memory
+    for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) {
+        SrcValueT value;
+        if (it.probeChild(value)) {
+            dstTile->setChild(it.getCoord(), dstChild++, dstRoot);
+        } else {
+            dstTile->setValue(it.getCoord(), it.isValueOn(), static_cast<DstValueT>(value));
+        }
+    }
+} // CreateNanoGrid::processRoot
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+inline typename util::enable_if<BuildTraits<DstBuildT>::is_index>::type
+CreateNanoGrid<SrcGridT>::processRoot()
+{
+    using DstRootT = NanoRoot<DstBuildT>;
+    auto &srcRoot = mSrcNodeAcc.root();
+    auto *dstRoot = this->template dstRoot<DstBuildT>();
+    const uint32_t tableSize = srcRoot.getTableSize();
+    if (DstRootT::DataType::padding()>0) util::memzero(dstRoot, DstRootT::memUsage(tableSize));
+    dstRoot->mTableSize = tableSize;
+    dstRoot->mBackground = 0u;
+    uint64_t valueCount = 0u;// the first entry is always the background value
+    dstRoot->mBBox = CoordBBox(); // set to an empty/invalid bounding box
+
+    if (tableSize>0) {
+        auto *dstChild = this->template dstNode<DstBuildT, 2>(0);// fixed size and linear in memory
+        auto *dstTile = dstRoot->tile(0);// fixed size and linear in memory
+        for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) {
+            SrcValueT tmp;
+            if (it.probeChild(tmp)) {
+                dstTile->setChild(it.getCoord(), dstChild++, dstRoot);
+            } else {
+                dstTile->setValue(it.getCoord(), it.isValueOn(), 0u);
+                if (mIncludeTiles && !((BuildTraits<DstBuildT>::is_onindex) && !dstTile->state)) dstTile->value = ++valueCount;
+            }
+        }
+    }
+    if (mIncludeTiles && mIncludeStats) {// stats are always placed after the tile values
+        dstRoot->mMinimum = ++valueCount;
+        dstRoot->mMaximum = ++valueCount;
+        dstRoot->mAverage = ++valueCount;
+        dstRoot->mStdDevi = ++valueCount;
+    } else if (dstRoot->padding()==0) {
+        dstRoot->mMinimum = 0u;
+        dstRoot->mMaximum = 0u;
+        dstRoot->mAverage = 0u;
+        dstRoot->mStdDevi = 0u;
+    }
+} // CreateNanoGrid::processRoot
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+void CreateNanoGrid<SrcGridT>::processTree()
+{
+    const uint64_t nodeCount[3] = {mSrcNodeAcc.nodeCount(0), mSrcNodeAcc.nodeCount(1), mSrcNodeAcc.nodeCount(2)};
+    auto *dstTree = this->template dstTree<DstBuildT>();
+    dstTree->setRoot( this->template dstRoot<DstBuildT>() );
+    dstTree->setFirstNode(nodeCount[2] ? this->template dstNode<DstBuildT, 2>(0) : nullptr);
+    dstTree->setFirstNode(nodeCount[1] ? this->template dstNode<DstBuildT, 1>(0) : nullptr);
+    dstTree->setFirstNode(nodeCount[0] ? this->template dstNode<DstBuildT, 0>(0) : nullptr);
+
+    dstTree->mNodeCount[0] = static_cast<uint32_t>(nodeCount[0]);
+    dstTree->mNodeCount[1] = static_cast<uint32_t>(nodeCount[1]);
+    dstTree->mNodeCount[2] = static_cast<uint32_t>(nodeCount[2]);
+
+    // Count number of active leaf level tiles
+    dstTree->mTileCount[0] = util::reduce(util::Range1D(0,nodeCount[1]), uint32_t(0), [&](util::Range1D &r, uint32_t sum){
+        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<1>(i).getValueMask().countOn();
+        return sum;}, std::plus<uint32_t>());
+
+    // Count number of active lower internal node tiles
+    dstTree->mTileCount[1] = util::reduce(util::Range1D(0,nodeCount[2]), uint32_t(0), [&](util::Range1D &r, uint32_t sum){
+        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<2>(i).getValueMask().countOn();
+        return sum;}, std::plus<uint32_t>());
+
+    // Count number of active upper internal node tiles
+    dstTree->mTileCount[2] = 0;
+    for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) dstTree->mTileCount[2] += 1;
+
+    // Count number of active voxels
+    dstTree->mVoxelCount = util::reduce(util::Range1D(0, nodeCount[0]), uint64_t(0), [&](util::Range1D &r, uint64_t sum){
+        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getValueMask().countOn();
+        return sum;}, std::plus<uint64_t>());
+
+    dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[0]) << 9;// 9 = 3 * 3
+    dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[1]) << 21;// 21 = 3 * (3+4)
+    dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[2]) << 36;// 36 = 3 * (3+4+5)
+
+} // CreateNanoGrid::processTree
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+void CreateNanoGrid<SrcGridT>::processGrid()
+{
+    auto* dstGrid = this->template dstGrid<DstBuildT>();
+    dstGrid->init({GridFlags::IsBreadthFirst}, mOffset.size, mSrcNodeAcc.map(),
+                  toGridType<DstBuildT>(), toGridClass<DstBuildT>(mSrcNodeAcc.gridClass()));
+    dstGrid->mBlindMetadataCount = static_cast<uint32_t>(mBlindMetaData.size());
+    dstGrid->mData1 = this->valueCount();
+
+//    if (!isValid(dstGrid->mGridType, dstGrid->mGridClass)) {
+//#if 1
+//        char str[30];
+//        fprintf(stderr,"Warning: Strange combination of GridType(\"%s\") and GridClass(\"%s\"). Consider changing GridClass to \"Unknown\"\n",
+//                toStr(str, dstGrid->mGridType), toStr(str + 15, dstGrid->mGridClass));
+//#else
+//        throw std::runtime_error("Invalid combination of GridType("+std::to_string(int(dstGrid->mGridType))+
+//                                 ") and GridClass("+std::to_string(int(dstGrid->mGridClass))+"). See NanoVDB.h for details!");
+//#endif
+//    }
+    util::memzero(dstGrid->mGridName, GridData::MaxNameSize);// initialize mGridName to zero
+    strncpy(dstGrid->mGridName, mSrcNodeAcc.getName().c_str(), GridData::MaxNameSize-1);
+    if (mSrcNodeAcc.hasLongGridName()) dstGrid->setLongGridNameOn();// grid name is long so store it as blind data
+
+    // Partially process blind meta data - it will be completed in postProcess
+    if (mBlindMetaData.size()>0) {
+        auto *metaData = this->dstMeta(0);
+        dstGrid->mBlindMetadataOffset = util::PtrDiff(metaData, dstGrid);
+        dstGrid->mBlindMetadataCount = static_cast<uint32_t>(mBlindMetaData.size());
+        char *blindData = util::PtrAdd<char>(mBufferPtr, mOffset.blind);
+        for (const auto &b : mBlindMetaData) {
+            std::memcpy(metaData, b.metaData, sizeof(GridBlindMetaData));
+            metaData->setBlindData(blindData);// sets metaData.mOffset
+            if (metaData->mDataClass == GridBlindDataClass::GridName) strcpy(blindData, mSrcNodeAcc.getName().c_str());
+            ++metaData;
+            blindData += b.size;
+        }
+        mBlindMetaData.clear();
+    }
+} // CreateNanoGrid::processGrid
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+inline typename util::disable_if<BuildTraits<DstBuildT>::is_index>::type
+CreateNanoGrid<SrcGridT>::postProcess()
+{
+    if constexpr(util::is_same<FpN, DstBuildT>::value) mCodec.reset();
+    auto *dstGrid = this->template dstGrid<DstBuildT>();
+    updateGridStats(dstGrid, mStats);
+#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
+    auto *metaData = this->dstMeta(0);
+    if constexpr(util::is_same<SrcGridT, openvdb::points::PointDataGrid>::value ||
+                 util::is_same<SrcGridT, openvdb::tools::PointIndexGrid>::value) {
+        static_assert(util::is_same<uint32_t, DstBuildT>::value, "expected DstBuildT==uint32_t");
+        auto *dstData0 = this->template dstNode<DstBuildT, 0>(0)->data();
+        dstData0->mMinimum = 0; // start of prefix sum
+        dstData0->mMaximum = dstData0->mValues[511u];
+        for (uint64_t i=1, n=mSrcNodeAcc.nodeCount(0); i<n; ++i) {
+            auto *dstData1 = dstData0 + 1;
+            dstData1->mMinimum = dstData0->mMinimum + dstData0->mMaximum;
+            dstData1->mMaximum = dstData1->mValues[511u];
+            dstData0 = dstData1;
+        }
+        for (size_t i = 0, n = dstGrid->blindDataCount(); i < n; ++i, ++metaData) {
+            if constexpr(util::is_same<SrcGridT, openvdb::tools::PointIndexGrid>::value) {
+                if (metaData->mDataClass != GridBlindDataClass::IndexArray) continue;
+                if (metaData->mDataType == GridType::UInt32) {
+                    uint32_t *blindData = const_cast<uint32_t*>(metaData->template getBlindData<uint32_t>());
+                    util::forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) {
+                        auto *dstLeaf = this->template dstNode<DstBuildT, 0>(r.begin());
+                        for (auto j = r.begin(); j != r.end(); ++j, ++dstLeaf) {
+                            uint32_t* p = blindData + dstLeaf->mMinimum;
+                            for (uint32_t idx : mSrcNodeAcc.template node<0>(j).indices()) *p++ = idx;
+                        }
+                    });
+                }
+            } else {// if constexpr(util::is_same<SrcGridT, openvdb::points::PointDataGrid>::value)
+                if (metaData->mDataClass != GridBlindDataClass::AttributeArray) continue;
+                if (auto *blindData = dstGrid->template getBlindData<float>(i)) {
+                    this->template copyPointAttribute(i, blindData);
+                } else if (auto *blindData = dstGrid->template getBlindData<Vec3f>(i)) {
+                    this->template copyPointAttribute(i, reinterpret_cast<openvdb::Vec3f*>(blindData));
+                } else if (auto *blindData = dstGrid->template getBlindData<int32_t>(i)) {
+                    this->template copyPointAttribute(i, blindData);
+                } else if (auto *blindData = dstGrid->template getBlindData<int64_t>(i)) {
+                    this->template copyPointAttribute(i, blindData);
+                } else {
+                    char str[16];
+                    std::cerr << "unsupported point attribute \"" << toStr(str, metaData->mDataType) << "\"\n";
+                }
+            }// if
+        }// loop
+    } else { // if constexpr
+        (void)metaData;
+    }
+#endif
+    updateChecksum(dstGrid, mChecksum);
+}// CreateNanoGrid::postProcess
+
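+// Note on the point-grid branch above: each leaf's mMinimum is re-purposed as
+// a running prefix sum over per-leaf point counts (mValues[511u] holds the
+// leaf's total), so mMinimum ends up being the index of the leaf's first point
+// within the blind-data arrays that copyPointAttribute writes into.
+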
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+inline typename util::enable_if<BuildTraits<DstBuildT>::is_index>::type
+CreateNanoGrid<SrcGridT>::postProcess(uint32_t channels)
+{
+    char str[16];
+    const std::string typeName = toStr(str, toGridType<SrcValueT>());
+    const uint64_t valueCount = this->valueCount();
+    auto *dstGrid = this->template dstGrid<DstBuildT>();
+    for (uint32_t i=0; i<channels; ++i) {
+        const std::string name = "channel_" + std::to_string(i);
+        int j = dstGrid->findBlindData(name.c_str());
+        if (j<0) throw std::runtime_error("missing " + name);
+        auto *metaData = this->dstMeta(j);// partially set in processGrid
+        metaData->mDataClass = GridBlindDataClass::ChannelArray;
+        metaData->mDataType = toGridType<SrcValueT>();
+        SrcValueT *blindData = const_cast<SrcValueT*>(metaData->template getBlindData<SrcValueT>());
+        if (i>0) {// concurrent copy from previous channel
+            util::forEach(0,valueCount,1024,[&](const util::Range1D &r){
+                SrcValueT *dst=blindData+r.begin(), *end=dst+r.size(), *src=dst-valueCount;
+                while(dst!=end) *dst++ = *src++;
+            });
+        } else {
+            this->template copyValues<DstBuildT>(blindData);
+        }
+    }// loop over channels
+    updateGridStats(this->template dstGrid<DstBuildT>(), std::min(StatsMode::BBox, mStats));
+    updateChecksum(dstGrid, mChecksum);
+}// CreateNanoGrid::postProcess
+
+//================================================================================================
+
+template <typename SrcGridT>
+template <typename DstBuildT>
+typename util::enable_if<BuildTraits<DstBuildT>::is_index>::type
+CreateNanoGrid<SrcGridT>::copyValues(SrcValueT *buffer)
+{// copy values from the source grid into the provided buffer
+    assert(mBufferPtr && buffer);
+    using StatsT = typename FloatTraits<SrcValueT>::FloatType;
+
+    if (this->valueCount()==0) this->template countValues<DstBuildT>();
+
+    auto copyNodeValues = [&](const auto &node, SrcValueT *v) {
+        if constexpr(BuildTraits<DstBuildT>::is_onindex) {
+            for (auto it = node.cbeginValueOn(); it; ++it) *v++ = *it;
+        } else {
+            for (auto it = node.cbeginValueAll(); it; ++it) *v++ = *it;
+        }
+        if (mIncludeStats) {
+            if constexpr(SrcNodeAccT::IS_NANOVDB) {// resolved at compile time
+                *v++ = node.minimum();
+                *v++ = node.maximum();
+                if constexpr(util::is_same<StatsT, SrcValueT>::value) {
+                    *v++ = node.average();
+                    *v++ = node.stdDeviation();
+                } else {// eg when SrcValueT=Vec3f and StatsT=float
+                    *v++ = SrcValueT(node.average());
+                    *v++ = SrcValueT(node.stdDeviation());
+                }
+            } else {// openvdb and nanovdb::tools::build::Grid have no stats
+                *v++ = buffer[0];// background
+                *v++ = buffer[0];// background
+                *v++ = buffer[0];// background
+                *v++ = buffer[0];// background
+            }
+        }
+    };// copyNodeValues
+
+    const SrcRootT &root = mSrcNodeAcc.root();
+    buffer[0] = root.background();// Value array always starts with the background value
+    if (mIncludeTiles) {
+        copyNodeValues(root, buffer + 1u);
+        util::forEach(0, mSrcNodeAcc.nodeCount(2), 1, [&](const util::Range1D& r) {
+            for (auto i = r.begin(); i!=r.end(); ++i) {
+                copyNodeValues(mSrcNodeAcc.template node<2>(i), buffer + mValIdx[2][i]);
+            }
+        });
+        util::forEach(0, mSrcNodeAcc.nodeCount(1), 1, [&](const util::Range1D& r) {
+            for (auto i = r.begin(); i!=r.end(); ++i) {
+                copyNodeValues(mSrcNodeAcc.template node<1>(i), buffer + mValIdx[1][i]);
+            }
+        });
+    }
+    util::forEach(0, mSrcNodeAcc.nodeCount(0), 4, [&](const util::Range1D& r) {
+        for (auto i = r.begin(); i!=r.end(); ++i) {
+            copyNodeValues(mSrcNodeAcc.template node<0>(i), buffer + mValIdx[0][i]);
+        }
+    });
+}// CreateNanoGrid::copyValues
+
+
+//================================================================================================
+
+#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
+
+template <typename SrcGridT>
+template <typename T>
+typename util::disable_if<util::is_same<T, openvdb::points::PointDataGrid>::value ||
+                          util::is_same<T, openvdb::tools::PointIndexGrid>::value, uint64_t>::type
+CreateNanoGrid<SrcGridT>::countPoints() const
+{
+    static_assert(util::is_same<T, SrcGridT>::value, "expected default template parameter");
+    return 0u;
+}// CreateNanoGrid::countPoints
+
+template <typename SrcGridT>
+template <typename T>
+typename util::enable_if<util::is_same<T, openvdb::points::PointDataGrid>::value ||
+                         util::is_same<T, openvdb::tools::PointIndexGrid>::value, uint64_t>::type
+CreateNanoGrid<SrcGridT>::countPoints() const
+{
+    static_assert(util::is_same<T, SrcGridT>::value, "expected default template parameter");
+    return util::reduce(0, mSrcNodeAcc.nodeCount(0), 8, uint64_t(0), [&](auto &r, uint64_t sum) {
+        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getLastValue();
+        return sum;}, std::plus<uint64_t>());
+}// CreateNanoGrid::countPoints
+
+template <typename SrcGridT>
+template <typename T, typename AttT>
+typename util::enable_if<util::is_same<T, openvdb::points::PointDataGrid>::value>::type
+CreateNanoGrid<SrcGridT>::copyPointAttribute(size_t attIdx, AttT *attPtr)
+{
+    static_assert(util::is_same<T, SrcGridT>::value, "Expected default parameter");
+    using HandleT = openvdb::points::AttributeHandle<AttT>;
+    util::forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) {
+        auto *dstLeaf = this->template dstNode<uint32_t, 0>(r.begin());
+        for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) {
+            auto& srcLeaf = mSrcNodeAcc.template node<0>(i);
+            HandleT handle(srcLeaf.constAttributeArray(attIdx));
+            AttT *p = attPtr + dstLeaf->mMinimum;
+            for (auto iter = srcLeaf.beginIndexOn(); iter; ++iter) *p++ = handle.get(*iter);
+        }
+    });
+}// CreateNanoGrid::copyPointAttribute
+
+#endif
+
+//================================================================================================
+
+template <typename SrcGridT, typename DstBuildT, typename BufferT>
+typename util::disable_if<BuildTraits<DstBuildT>::is_index || BuildTraits<DstBuildT>::is_Fp, GridHandle<BufferT>>::type
+createNanoGrid(const SrcGridT &srcGrid,
+               StatsMode sMode,
+               CheckMode cMode,
+               int verbose,
+               const BufferT &buffer)
+{
+    CreateNanoGrid<SrcGridT> converter(srcGrid);
+    converter.setStats(sMode);
+    converter.setChecksum(cMode);
+    converter.setVerbose(verbose);
+    return converter.template getHandle<DstBuildT, BufferT>(buffer);
+}// createNanoGrid
+
+//================================================================================================
+
+template <typename SrcGridT, typename DstBuildT, typename BufferT>
+typename util::enable_if<BuildTraits<DstBuildT>::is_index, GridHandle<BufferT>>::type
+createNanoGrid(const SrcGridT &srcGrid,
+               uint32_t channels,
+               bool includeStats,
+               bool includeTiles,
+               int verbose,
+               const BufferT &buffer)
+{
+    CreateNanoGrid<SrcGridT> converter(srcGrid);
+    converter.setVerbose(verbose);
+    return converter.template getHandle<DstBuildT, BufferT>(channels, includeStats, includeTiles, buffer);
+}
+
+//================================================================================================
+
+template <typename SrcGridT, typename DstBuildT, typename OracleT, typename BufferT>
+typename util::enable_if<util::is_same<FpN, DstBuildT>::value, GridHandle<BufferT>>::type
+createNanoGrid(const SrcGridT &srcGrid,
+               StatsMode sMode,
+               CheckMode cMode,
+               bool ditherOn,
+               int verbose,
+               const OracleT &oracle,
+               const BufferT &buffer)
+{
+    CreateNanoGrid<SrcGridT> converter(srcGrid);
+    converter.setStats(sMode);
+    converter.setChecksum(cMode);
+    converter.enableDithering(ditherOn);
+    converter.setVerbose(verbose);
+    return converter.template getHandle<DstBuildT, OracleT, BufferT>(oracle, buffer);
+}// createNanoGrid
+
+//================================================================================================
+
+template <typename SrcGridT, typename DstBuildT, typename BufferT>
+typename util::enable_if<BuildTraits<DstBuildT>::is_FpX, GridHandle<BufferT>>::type
+createNanoGrid(const SrcGridT &srcGrid,
+               StatsMode sMode,
+               CheckMode cMode,
+               bool ditherOn,
+               int verbose,
+               const BufferT &buffer)
+{
+    CreateNanoGrid<SrcGridT> converter(srcGrid);
+    converter.setStats(sMode);
+    converter.setChecksum(cMode);
+    converter.enableDithering(ditherOn);
+    converter.setVerbose(verbose);
+    return converter.template getHandle<DstBuildT, BufferT>(buffer);
+}// createNanoGrid
+
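+// Example usage of the createNanoGrid overloads above (an illustrative sketch
+// only; it relies on the defaulted arguments from the declarations earlier in
+// this header and assumes a source grid named srcGrid, e.g. an
+// openvdb::FloatGrid or a tools::build::Grid<float>):
+//
+//   nanovdb::GridHandle<> handle = nanovdb::tools::createNanoGrid(srcGrid);
+//   const nanovdb::NanoGrid<float>* dstGrid = handle.grid<float>();
+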
+//================================================================================================
+
+#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
+template<typename BufferT>
+GridHandle<BufferT>
+openToNanoVDB(const openvdb::GridBase::Ptr& base,
+              StatsMode sMode,
+              CheckMode cMode,
+              int verbose)
+{
+    // We need to define these types because they are not defined in OpenVDB
+    using openvdb_Vec4fTree = typename openvdb::tree::Tree4<openvdb::Vec4f, 5, 4, 3>::Type;
+    using openvdb_Vec4dTree = typename openvdb::tree::Tree4<openvdb::Vec4d, 5, 4, 3>::Type;
+    using openvdb_Vec4fGrid = openvdb::Grid<openvdb_Vec4fTree>;
+    using openvdb_Vec4dGrid = openvdb::Grid<openvdb_Vec4dTree>;
+    using openvdb_UInt32Grid = openvdb::Grid<openvdb::UInt32Tree>;
+
+    if (auto grid = openvdb::GridBase::grid<openvdb::FloatGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::DoubleGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::Int32Grid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::Int64Grid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb_UInt32Grid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::Vec3fGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::Vec3dGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::tools::PointIndexGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::points::PointDataGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::MaskGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb::BoolGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb_Vec4fGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else if (auto grid = openvdb::GridBase::grid<openvdb_Vec4dGrid>(base)) {
+        return createNanoGrid(*grid, sMode, cMode, verbose);
+    } else {
+        OPENVDB_THROW(openvdb::RuntimeError, "Unrecognized OpenVDB grid type");
+    }
+}// openToNanoVDB
+#endif
+
+}// namespace tools ===============================================================================
+
+} // namespace nanovdb
+
+#endif // NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/CreatePrimitives.h b/nanovdb/nanovdb/tools/CreatePrimitives.h
new file mode 100644
index 0000000000..a28d5bacd4
--- /dev/null
+++ b/nanovdb/nanovdb/tools/CreatePrimitives.h
@@ -0,0 +1,1752 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/CreatePrimitives.h
+
+    \author Ken Museth
+
+    \date June 26, 2020
+
+    \brief Generates volumetric primitives, e.g. sphere, torus etc., as NanoVDB grids.
+
+    \note This has no dependency on openvdb.
+*/
+
+#ifndef NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED
+
+#define NANOVDB_PARALLEL_PRIMITIVES
+
+#include <nanovdb/NanoVDB.h>
+#include <nanovdb/tools/CreateNanoGrid.h>
+#include <nanovdb/util/ForEach.h> // for util::forEach and util::Range
+
+namespace nanovdb {
+
+namespace tools {// ===================================================
+
+/// @brief Returns a handle to a narrow-band level set of a sphere
+///
+/// @param radius    Radius of sphere in world units
+/// @param center    Center of sphere in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<BuildT, float, double>::value, GridHandle<BufferT>>::type
+createLevelSetSphere(double radius = 100.0,
+                     const Vec3d& center = Vec3d(0),
+                     double voxelSize = 1.0,
+                     double halfWidth = 3.0,
+                     const Vec3d& origin = Vec3d(0),
+                     const std::string& name = "sphere_ls",
+                     StatsMode sMode = StatsMode::Default,
+                     CheckMode cMode = CheckMode::Default,
+                     const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<BuildT, Fp4, Fp8, Fp16>::value, GridHandle<BufferT>>::type
+createLevelSetSphere(double radius = 100.0,
+                     const Vec3d& center = Vec3d(0),
+                     double voxelSize = 1.0,
+                     double halfWidth = 3.0,
+                     const Vec3d& origin = Vec3d(0),
+                     const std::string& name = "sphere_ls",
+                     StatsMode sMode = StatsMode::Default,
+                     CheckMode cMode = CheckMode::Default,
+                     bool ditherOn = false,
+                     const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetSphere(double radius = 100.0,
+                     const Vec3d& center = Vec3d(0),
+                     double voxelSize = 1.0,
+                     double halfWidth = 3.0,
+                     const Vec3d& origin = Vec3d(0),
+                     const std::string& name = "sphere_ls_FpN",
+                     StatsMode sMode = StatsMode::Default,
+                     CheckMode cMode = CheckMode::Default,
+                     float tolerance = -1.0f,
+                     bool ditherOn = false,
+                     const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a sparse fog volume of a sphere such
+///        that the exterior is 0 and inactive, the interior is active
+///        with values varying smoothly from 0 at the surface of the
+///        sphere to 1 at the halfWidth and in the interior of the sphere.
+///
+/// @param radius    Radius of sphere in world units
+/// @param center    Center of sphere in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeSphere(double radius = 100.0,
+                      const Vec3d& center = Vec3d(0.0),
+                      double voxelSize = 1.0,
+                      double halfWidth = 3.0,
+                      const Vec3d& origin = Vec3d(0.0),
+                      const std::string& name = "sphere_fog",
+                      StatsMode sMode = StatsMode::Default,
+                      CheckMode cMode = CheckMode::Default,
+                      const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeSphere(double radius = 100.0,
+                      const Vec3d& center = Vec3d(0.0),
+                      double voxelSize = 1.0,
+                      double halfWidth = 3.0,
+                      const Vec3d& origin = Vec3d(0.0),
+                      const std::string& name = "sphere_fog",
+                      StatsMode sMode = StatsMode::Default,
+                      CheckMode cMode = CheckMode::Default,
+                      float tolerance = -1.0f,
+                      bool ditherOn = false,
+                      const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a PointDataGrid containing points scattered
+///        on the surface of a sphere.
+///
+/// @param pointsPerVoxel Number of points per voxel on the surface
+/// @param radius         Radius of sphere in world units
+/// @param center         Center of sphere in world units
+/// @param voxelSize      Size of a voxel in world units
+/// @param origin         Origin of grid in world units
+/// @param name           Name of the grid
+/// @param mode           Mode of computation for the checksum.
+/// @param buffer         Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be float (default) or double.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createPointSphere(int pointsPerVoxel = 1,
+                  double radius = 100.0,
+                  const Vec3d& center = Vec3d(0.0),
+                  double voxelSize = 1.0,
+                  const Vec3d& origin = Vec3d(0.0),
+                  const std::string& name = "sphere_points",
+                  CheckMode mode = CheckMode::Default,
+                  const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a narrow-band level set of a torus in the xz-plane
+///
+/// @param majorRadius Major radius of torus in world units
+/// @param minorRadius Minor radius of torus in world units
+/// @param center      Center of torus in world units
+/// @param voxelSize   Size of a voxel in world units
+/// @param halfWidth   Half-width of narrow band in voxel units
+/// @param origin      Origin of grid in world units
+/// @param name        Name of the grid
+/// @param sMode       Mode of computation for the statistics.
+/// @param cMode       Mode of computation for the checksum.
+/// @param tolerance   Global error tolerance used when BuildT = FpN
+/// @param ditherOn    If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer      Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
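+///
+/// @par Example:
+/// @code{.cpp}
+/// // Illustrative sketch only, relying on the default arguments above:
+/// auto handle = nanovdb::tools::createLevelSetTorus<float>();
+/// auto *grid = handle.grid<float>(); // 100x50 world-unit torus level set
+/// @endcode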
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetTorus(double majorRadius = 100.0,
+                    double minorRadius = 50.0,
+                    const Vec3d& center = Vec3d(0.0),
+                    double voxelSize = 1.0,
+                    double halfWidth = 3.0,
+                    const Vec3d& origin = Vec3d(0.0),
+                    const std::string& name = "torus_ls",
+                    StatsMode sMode = StatsMode::Default,
+                    CheckMode cMode = CheckMode::Default,
+                    const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetTorus(double majorRadius = 100.0,
+                    double minorRadius = 50.0,
+                    const Vec3d& center = Vec3d(0.0),
+                    double voxelSize = 1.0,
+                    double halfWidth = 3.0,
+                    const Vec3d& origin = Vec3d(0.0),
+                    const std::string& name = "torus_ls",
+                    StatsMode sMode = StatsMode::Default,
+                    CheckMode cMode = CheckMode::Default,
+                    float tolerance = -1.0f,
+                    bool ditherOn = false,
+                    const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a sparse fog volume of a torus in the xz-plane such
+///        that the exterior is 0 and inactive, the interior is active
+///        with values varying smoothly from 0 at the surface of the
+///        torus to 1 at the halfWidth and in the interior of the torus.
+///
+/// @param majorRadius Major radius of torus in world units
+/// @param minorRadius Minor radius of torus in world units
+/// @param center      Center of torus in world units
+/// @param voxelSize   Size of a voxel in world units
+/// @param halfWidth   Half-width of narrow band in voxel units
+/// @param origin      Origin of grid in world units
+/// @param name        Name of the grid
+/// @param sMode       Mode of computation for the statistics.
+/// @param cMode       Mode of computation for the checksum.
+/// @param tolerance   Global error tolerance used when BuildT = FpN
+/// @param ditherOn    If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer      Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeTorus(double majorRadius = 100.0,
+                     double minorRadius = 50.0,
+                     const Vec3d& center = Vec3d(0.0),
+                     double voxelSize = 1.0,
+                     double halfWidth = 3.0,
+                     const Vec3d& origin = Vec3d(0.0),
+                     const std::string& name = "torus_fog",
+                     StatsMode sMode = StatsMode::Default,
+                     CheckMode cMode = CheckMode::Default,
+                     const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeTorus(double majorRadius = 100.0,
+                     double minorRadius = 50.0,
+                     const Vec3d& center = Vec3d(0.0),
+                     double voxelSize = 1.0,
+                     double halfWidth = 3.0,
+                     const Vec3d& origin = Vec3d(0.0),
+                     const std::string& name = "torus_fog_FpN",
+                     StatsMode sMode = StatsMode::Default,
+                     CheckMode cMode = CheckMode::Default,
+                     float tolerance = -1.0f,
+                     bool ditherOn = false,
+                     const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a PointDataGrid containing points scattered
+///        on the surface of a torus.
+///
+/// @param pointsPerVoxel Number of points per voxel on the surface
+/// @param majorRadius    Major radius of torus in world units
+/// @param minorRadius    Minor radius of torus in world units
+/// @param center         Center of torus in world units
+/// @param voxelSize      Size of a voxel in world units
+/// @param origin         Origin of grid in world units
+/// @param name           Name of the grid
+/// @param cMode          Mode of computation for the checksum.
+/// @param buffer         Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be float (default) or double.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createPointTorus(int pointsPerVoxel = 1, // number of points per voxel on the surface
+                 double majorRadius = 100.0, // major radius of torus in world units
+                 double minorRadius = 50.0, // minor radius of torus in world units
+                 const Vec3d& center = Vec3d(0.0), // center of torus in world units
+                 double voxelSize = 1.0, // size of a voxel in world units
+                 const Vec3d& origin = Vec3d(0.0), // origin of grid in world units
+                 const std::string& name = "torus_points", // name of grid
+                 CheckMode cMode = CheckMode::Default,
+                 const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a narrow-band level set of a box
+///
+/// @param width     Width of box in world units
+/// @param height    Height of box in world units
+/// @param depth     Depth of box in world units
+/// @param center    Center of box in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
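+///
+/// @par Example:
+/// @code{.cpp}
+/// // Illustrative sketch only: the default 40x60x100 world-unit box,
+/// // quantized to Fp4 via the non-FpN overload's defaults above.
+/// auto handle = nanovdb::tools::createLevelSetBox<nanovdb::Fp4>();
+/// @endcode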
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetBox(double width = 40.0,
+                  double height = 60.0,
+                  double depth = 100.0,
+                  const Vec3d& center = Vec3d(0.0),
+                  double voxelSize = 1.0,
+                  double halfWidth = 3.0,
+                  const Vec3d& origin = Vec3d(0.0),
+                  const std::string& name = "box_ls",
+                  StatsMode sMode = StatsMode::Default,
+                  CheckMode cMode = CheckMode::Default,
+                  const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetBox(double width = 40.0,
+                  double height = 60.0,
+                  double depth = 100.0,
+                  const Vec3d& center = Vec3d(0.0),
+                  double voxelSize = 1.0,
+                  double halfWidth = 3.0,
+                  const Vec3d& origin = Vec3d(0.0),
+                  const std::string& name = "box_ls_FpN",
+                  StatsMode sMode = StatsMode::Default,
+                  CheckMode cMode = CheckMode::Default,
+                  float tolerance = -1.0f,
+                  bool ditherOn = false,
+                  const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a sparse fog volume of a box such
+///        that the exterior is 0 and inactive, the interior is active
+///        with values varying smoothly from 0 at the surface of the
+///        box to 1 at the halfWidth and in the interior of the box.
+///
+/// @param width     Width of box in world units
+/// @param height    Height of box in world units
+/// @param depth     Depth of box in world units
+/// @param center    Center of box in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeBox(double width = 40.0,
+                   double height = 60.0,
+                   double depth = 100.0,
+                   const Vec3d& center = Vec3d(0.0),
+                   double voxelSize = 1.0,
+                   double halfWidth = 3.0,
+                   const Vec3d& origin = Vec3d(0.0),
+                   const std::string& name = "box_fog",
+                   StatsMode sMode = StatsMode::Default,
+                   CheckMode cMode = CheckMode::Default,
+                   const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeBox(double width = 40.0,
+                   double height = 60.0,
+                   double depth = 100.0,
+                   const Vec3d& center = Vec3d(0.0),
+                   double voxelSize = 1.0,
+                   double halfWidth = 3.0,
+                   const Vec3d& origin = Vec3d(0.0),
+                   const std::string& name = "box_fog_FpN",
+                   StatsMode sMode = StatsMode::Default,
+                   CheckMode cMode = CheckMode::Default,
+                   float tolerance = -1.0f,
+                   bool ditherOn = false,
+                   const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a narrow-band level set of an octahedron
+///
+/// @param scale     Scale of octahedron in world units
+/// @param center    Center of octahedron in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetOctahedron(double scale = 100.0,
+                         const Vec3d& center = Vec3d(0.0),
+                         double voxelSize = 1.0,
+                         double halfWidth = 3.0,
+                         const Vec3d& origin = Vec3d(0.0),
+                         const std::string& name = "octadedron_ls",
+                         StatsMode sMode = StatsMode::Default,
+                         CheckMode cMode = CheckMode::Default,
+                         const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetOctahedron(double scale = 100.0,
+                         const Vec3d& center = Vec3d(0.0),
+                         double voxelSize = 1.0,
+                         double halfWidth = 3.0,
+                         const Vec3d& origin = Vec3d(0.0),
+                         const std::string& name = "octadedron_ls_FpN",
+                         StatsMode sMode = StatsMode::Default,
+                         CheckMode cMode = CheckMode::Default,
+                         float tolerance = -1.0f,
+                         bool ditherOn = false,
+                         const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a sparse fog volume of an octahedron such
+///        that the exterior is 0 and inactive, the interior is active
+///        with values varying smoothly from 0 at the surface of the
+///        octahedron to 1 at the halfWidth and in the interior of the octahedron.
+///
+/// @param scale     Scale of octahedron in world units
+/// @param center    Center of octahedron in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeOctahedron(double scale = 100.0,
+                          const Vec3d& center = Vec3d(0.0),
+                          double voxelSize = 1.0,
+                          double halfWidth = 3.0,
+                          const Vec3d& origin = Vec3d(0.0),
+                          const std::string& name = "octadedron_fog",
+                          StatsMode sMode = StatsMode::Default,
+                          CheckMode cMode = CheckMode::Default,
+                          const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createFogVolumeOctahedron(double scale = 100.0,
+                          const Vec3d& center = Vec3d(0.0),
+                          double voxelSize = 1.0,
+                          double halfWidth = 3.0,
+                          const Vec3d& origin = Vec3d(0.0),
+                          const std::string& name = "octadedron_fog_FpN",
+                          StatsMode sMode = StatsMode::Default,
+                          CheckMode cMode = CheckMode::Default,
+                          float tolerance = -1.0f,
+                          bool ditherOn = false,
+                          const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a narrow-band level set of a bounding-box (= wireframe of a box)
+///
+/// @param width     Width of box in world units
+/// @param height    Height of box in world units
+/// @param depth     Depth of box in world units
+/// @param thickness Thickness of the wire in world units
+/// @param center    Center of bbox in world units
+/// @param voxelSize Size of a voxel in world units
+/// @param halfWidth Half-width of narrow band in voxel units
+/// @param origin    Origin of grid in world units
+/// @param name      Name of the grid
+/// @param sMode     Mode of computation for the statistics.
+/// @param cMode     Mode of computation for the checksum.
+/// @param tolerance Global error tolerance used when BuildT = FpN
+/// @param ditherOn  If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN}
+/// @param buffer    Buffer used for memory allocation by the handle
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
+///          argument is only used when BuildT is set to FpN.
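+///
+/// @par Example:
+/// @code{.cpp}
+/// // Illustrative sketch only: the default 40x60x100 wireframe box with
+/// // 10 world-unit thick edges, relying on the defaults above.
+/// auto handle = nanovdb::tools::createLevelSetBBox<float>();
+/// @endcode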
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetBBox(double width = 40.0,
+                   double height = 60.0,
+                   double depth = 100.0,
+                   double thickness = 10.0,
+                   const Vec3d& center = Vec3d(0.0),
+                   double voxelSize = 1.0,
+                   double halfWidth = 3.0,
+                   const Vec3d& origin = Vec3d(0.0),
+                   const std::string& name = "bbox_ls",
+                   StatsMode sMode = StatsMode::Default,
+                   CheckMode cMode = CheckMode::Default,
+                   const BufferT& buffer = BufferT());
+
+template <typename BuildT, typename BufferT = HostBuffer>
+typename util::enable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createLevelSetBBox(double width = 40.0,
+                   double height = 60.0,
+                   double depth = 100.0,
+                   double thickness = 10.0,
+                   const Vec3d& center = Vec3d(0.0),
+                   double voxelSize = 1.0,
+                   double halfWidth = 3.0,
+                   const Vec3d& origin = Vec3d(0.0),
+                   const std::string& name = "bbox_ls_FpN",
+                   StatsMode sMode = StatsMode::Default,
+                   CheckMode cMode = CheckMode::Default,
+                   float tolerance = -1.0f,
+                   bool ditherOn = false,
+                   const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Returns a handle to a PointDataGrid containing points scattered
+///        on the surface of a box.
+///
+/// @param pointsPerVoxel Number of points per voxel on the surface
+/// @param width          Width of box in world units
+/// @param height         Height of box in world units
+/// @param depth          Depth of box in world units
+/// @param center         Center of box in world units
+/// @param voxelSize      Size of a voxel in world units
+/// @param origin         Origin of grid in world units
+/// @param name           Name of the grid
+/// @param mode           Mode of computation for the checksum.
+/// @param buffer         Buffer used for memory allocation by the handle
+template <typename BuildT = float, typename BufferT = HostBuffer>
+typename util::disable_if<util::is_same<FpN, BuildT>::value, GridHandle<BufferT>>::type
+createPointBox(int pointsPerVoxel = 1, // number of points per voxel on the surface
+               double width = 40.0, // width of box in world units
+               double height = 60.0, // height of box in world units
+               double depth = 100.0, // depth of box in world units
+               const Vec3d& center = Vec3d(0.0), // center of box in world units
+               double voxelSize = 1.0, // size of a voxel in world units
+               const Vec3d& origin = Vec3d(0.0), // origin of grid in world units
+               const std::string& name = "box_points", // name of grid
+               CheckMode mode = CheckMode::Default,
+               const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+/// @brief Given an input NanoVDB voxel grid this method returns a GridHandle to another NanoVDB
+///        PointDataGrid with points scattered in the active leaf voxels of the input grid. Note,
+///        the coordinates of the points are encoded as blind data in world-space.
+///
+/// @param srcGrid        Const input grid used to determine the active voxels to scatter points into
+/// @param pointsPerVoxel Number of points per active voxel
+/// @param name           Name of the grid
+/// @param mode           Mode of computation for the checksum.
+/// @param buffer         Buffer used for memory allocation by the handle
+template <typename SrcBuildT = float, typename BufferT = HostBuffer>
+inline GridHandle<BufferT>
+createPointScatter(const NanoGrid<SrcBuildT>& srcGrid, // source grid used to scatter points into
+                   int pointsPerVoxel = 1, // number of points per active voxel
+                   const std::string& name = "point_scatter", // name of grid
+                   CheckMode mode = CheckMode::Default,
+                   const BufferT& buffer = BufferT());
+
+//================================================================================================
+
+namespace {
+
+/// @brief Returns a shared pointer to a build::Grid containing narrow-band SDF values for a sphere
+///
+/// @note This is not (yet) a valid level set SDF field since values inside the sphere (and outside
+///       the narrow band) are still undefined. Call build::sdfToLevelSet() to set those
+///       values or alternatively call build::levelSetToFog to generate a FOG volume.
+///
+/// @details The @c BuildT template parameter must be one of the following:
+///          float (default), double, Fp4, Fp8, Fp16 or FpN.
+template <typename BuildT>
+std::shared_ptr<build::Grid<BuildT>>
+initSphere(double radius, // radius of sphere in world units
+           const Vec3d& center, // center of sphere in world units
+           double voxelSize, // size of a voxel in world units
+           double halfWidth, // half-width of narrow band in voxel units
+           const Vec3d& origin) // origin of grid in world units
+{
+    using GridT = build::Grid<BuildT>;
+    using ValueT = typename BuildToValueMap<BuildT>::type;
+    static_assert(util::is_floating_point<ValueT>::value, "initSphere: expect floating point");
+    if (!(radius > 0))
+        throw std::runtime_error("Sphere: radius must be positive!");
+    if (!(voxelSize > 0))
+        throw std::runtime_error("Sphere: voxelSize must be positive!");
+    if (!(halfWidth > 0))
+        throw std::runtime_error("Sphere: halfWidth must be positive!");
+
+    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
+    grid->setTransform(voxelSize, origin);
+
+    // Define radius of sphere with narrow-band in voxel units
+    const ValueT r0 = radius / ValueT(voxelSize), rmax = r0 + ValueT(halfWidth);
+
+    // Radius below the Nyquist frequency
+    if (r0 < ValueT(1.5f)) return grid;
+
+    // Define center of sphere in voxel units
+    const math::Vec3<ValueT> c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
+                               ValueT(center[1] - origin[1]) / ValueT(voxelSize),
+                               ValueT(center[2] - origin[2]) / ValueT(voxelSize));
+
+    // Define bounds of the voxel coordinates
+    const int imin = math::Floor(c[0] - rmax), imax = math::Ceil(c[0] + rmax);
+    const int jmin = math::Floor(c[1] - rmax), jmax = math::Ceil(c[1] + rmax);
+    const int kmin = math::Floor(c[2] - rmax), kmax = math::Ceil(c[2] + rmax);
+
+    const util::Range<1,int> range(imin, imax+1, 32);
+
+    auto kernel = [&](const util::Range<1,int> &r) {
+        auto acc = grid->getWriteAccessor();
+        Coord ijk;
+        int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1;
+        // Compute signed distances to sphere using leapfrogging in k
+        for (i = r.begin(); i < r.end(); ++i) {
+            const auto x2 = math::Pow2(ValueT(i) - c[0]);
+            for (j = jmin; j <= jmax; ++j) {
+                const auto x2y2 = math::Pow2(ValueT(j) - c[1]) + x2;
+                for (k = kmin; k <= kmax; k += m) {
+                    m = 1;
+                    const auto v = math::Sqrt(x2y2 + math::Pow2(ValueT(k) - c[2])) - r0; // Distance in voxel units
+                    const auto d = v < 0 ? -v : v;
+                    if (d < halfWidth) { // inside narrow band
+                        acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units
+                    } else { // outside narrow band
+                        m += math::Floor(d - halfWidth); // leapfrog
+                    }
+                } //end leapfrog over k
+            } //end loop over j
+        } //end loop over i
+    };// kernel
+#ifdef NANOVDB_PARALLEL_PRIMITIVES
+    util::forEach(range, kernel);
+#else
+    kernel(range);
+#endif
+    return grid;
+} // initSphere
+
+template <typename BuildT>
+std::shared_ptr<build::Grid<BuildT>>
+initTorus(double radius1, // major radius of torus in world units
+          double radius2, // minor radius of torus in world units
+          const Vec3d& center, // center of torus in world units
+          double voxelSize, // size of a voxel in world units
+          double halfWidth, // half-width of narrow band in voxel units
+          const Vec3d& origin) // origin of grid in world units
+{
+    using GridT = build::Grid<BuildT>;
+    using ValueT = typename BuildToValueMap<BuildT>::type;
+    static_assert(util::is_floating_point<ValueT>::value, "initTorus: expect floating point");
+    if (!(radius2 > 0))
+        throw std::runtime_error("Torus: radius2 must be positive!");
+    if (!(radius1 > radius2))
+        throw std::runtime_error("Torus: radius1 must be larger than radius2!");
+    if (!(voxelSize > 0))
+        throw std::runtime_error("Torus: voxelSize must be positive!");
+    if (!(halfWidth > 0))
+        throw std::runtime_error("Torus: halfWidth must be positive!");
+
+    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
+    grid->setTransform(voxelSize, origin);
+
+    // Define size of torus with narrow-band in voxel units
+    const ValueT r1 = radius1 / ValueT(voxelSize), r2 = radius2 / ValueT(voxelSize), rmax1 = r1 + r2 + ValueT(halfWidth), rmax2 = r2 + ValueT(halfWidth);
+
+    // Radius below the Nyquist frequency
+    if (r2 < ValueT(1.5)) return grid;
+
+    // Define center of torus in voxel units
+    const math::Vec3<ValueT> c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
+                               ValueT(center[1] - origin[1]) / ValueT(voxelSize),
+                               ValueT(center[2] - origin[2]) / ValueT(voxelSize));
+
+    // Define bounds of the voxel coordinates
+    const int imin = math::Floor(c[0] - rmax1), imax = math::Ceil(c[0] + rmax1);
+    const int jmin = math::Floor(c[1] - rmax2), jmax = math::Ceil(c[1] + rmax2);
+    const int kmin = math::Floor(c[2] - rmax1), kmax = math::Ceil(c[2] + rmax1);
+
+    const util::Range<1,int> range(imin, imax+1, 32);
+    auto kernel = [&](const util::Range<1,int> &r) {
+        auto acc = grid->getWriteAccessor();
+        Coord ijk;
+        int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1;
+        // Compute signed distances to torus using leapfrogging in j
+        for (i = r.begin(); i < r.end(); ++i) {
+            const auto x2 = math::Pow2(ValueT(i) - c[0]);
+            for (k = kmin; k <= kmax; ++k) {
+                const auto x2z2 = math::Pow2(math::Sqrt(math::Pow2(ValueT(k) - c[2]) + x2) - r1);
+                for (j = jmin; j <= jmax; j += m) {
+                    m = 1;
+                    const auto v = math::Sqrt(x2z2 + math::Pow2(ValueT(j) - c[1])) - r2; // Distance in voxel units
+                    const auto d = v < 0 ? -v : v;
+                    if (d < halfWidth) { // inside narrow band
+                        acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units
+                    } else { // outside narrow band
+                        m += math::Floor(d - halfWidth); // leapfrog
+                    }
+                } //end leapfrog over j
+            } //end loop over k
+        } //end loop over i
+    }; // kernel
+
+#ifdef NANOVDB_PARALLEL_PRIMITIVES
+    util::forEach(range, kernel);
+#else
+    kernel(range);
+#endif
+
+    return grid;
+} // initTorus
+
+template <typename BuildT>
+std::shared_ptr<build::Grid<BuildT>>
+initBox(double width, // width of box in world units
+        double height, // height of box in world units
+        double depth, // depth of box in world units
+        const Vec3d& center, // center of box in world units
+        double voxelSize, // size of a voxel in world units
+        double halfWidth, // half-width of narrow band in voxel units
+        const Vec3d& origin) // origin of grid in world units
+{
+    using GridT = build::Grid<BuildT>;
+    using ValueT = typename BuildToValueMap<BuildT>::type;
+    static_assert(util::is_floating_point<ValueT>::value, "initBox: expect floating point");
+    using Vec3T = math::Vec3<ValueT>;
+    if (!(width > 0))
+        throw std::runtime_error("Box: width must be positive!");
+    if (!(height > 0))
+        throw std::runtime_error("Box: height must be positive!");
+    if (!(depth > 0))
+        throw std::runtime_error("Box: depth must be positive!");
+
+    if (!(voxelSize > 0))
+        throw std::runtime_error("Box: voxelSize must be positive!");
+    if (!(halfWidth > 0))
+        throw std::runtime_error("Box: halfWidth must be positive!");
+
+    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
+    grid->setTransform(voxelSize, origin);
+
+    // Define size of box with narrow-band in voxel units
+    const Vec3T r(width / (2 * ValueT(voxelSize)),
+                  height / (2 * ValueT(voxelSize)),
+                  depth / (2 * ValueT(voxelSize)));
+
+    // Below the Nyquist frequency
+    if (r.min() < ValueT(1.5)) return grid;
+
+    // Define center of box in voxel units
+    const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
+                  ValueT(center[1] - origin[1]) / ValueT(voxelSize),
+                  ValueT(center[2] - origin[2]) / ValueT(voxelSize));
+
+    // Define utility functions
+    auto Pos = [](ValueT x) { return x > 0 ? x : 0; };
+    auto Neg = [](ValueT x) { return x < 0 ? x : 0; };
+
+    // Define bounds of the voxel coordinates
+    const math::BBox<Vec3T> b(c - r - Vec3T(ValueT(halfWidth)), c + r + Vec3T(ValueT(halfWidth)));
+    const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])),
+                         Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2])));
+    const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32);
+
+    // Compute signed distances to box using leapfrogging in k
+    auto kernel = [&](const util::Range<1,int> &ra) {
+        auto acc = grid->getWriteAccessor();
+        int m = 1;
+        for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) {
+            const auto q1 = math::Abs(ValueT(p[0]) - c[0]) - r[0];
+            const auto x2 = math::Pow2(Pos(q1));
+            for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) {
+                const auto q2 = math::Abs(ValueT(p[1]) - c[1]) - r[1];
+                const auto q0 = math::Max(q1, q2);
+                const auto x2y2 = x2 + math::Pow2(Pos(q2));
+                for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) {
+                    m = 1;
+                    const auto q3 = math::Abs(ValueT(p[2]) - c[2]) - r[2];
+                    const auto v = math::Sqrt(x2y2 + math::Pow2(Pos(q3))) + Neg(math::Max(q0, q3)); // Distance in voxel units
+                    const auto d = math::Abs(v);
+                    if (d < halfWidth) { // inside narrow band
+                        acc.setValue(p, ValueT(voxelSize) * v); // distance in world units
+                    } else { // outside narrow band
+                        m += math::Floor(d - halfWidth); // leapfrog
+                    }
+                } //end leapfrog over k
+            } //end loop over j
+        } //end loop over i
+    }; // kernel
+#ifdef NANOVDB_PARALLEL_PRIMITIVES
+    util::forEach(range, kernel);
+#else
+    kernel(range);
+#endif
+    return grid;
+} // initBox
+
+template <typename BuildT>
+std::shared_ptr<build::Grid<BuildT>>
+initBBox(double width, // width of the bbox in world units
+         double height, // height of the bbox in world units
+         double depth, // depth of the bbox in world units
+         double thickness, // thickness of the wire in world units
+         const Vec3d& center, // center of bbox in world units
+         double voxelSize, // size of a voxel in world units
+         double halfWidth, // half-width of narrow band in voxel units
+         const Vec3d& origin) // origin of grid in world units
+{
+    using GridT = build::Grid<BuildT>;
+    using ValueT = typename BuildToValueMap<BuildT>::type;
+    static_assert(util::is_floating_point<ValueT>::value, "initBBox: expect floating point");
+    using Vec3T = math::Vec3<ValueT>;
+    if (!(width > 0))
+        throw std::runtime_error("BBox: width must be positive!");
+    if (!(height > 0))
+        throw std::runtime_error("BBox: height must be positive!");
+    if (!(depth > 0))
+        throw std::runtime_error("BBox: depth must be positive!");
+    if (!(thickness > 0))
+        throw std::runtime_error("BBox: thickness must be positive!");
+    if (!(voxelSize > 0.0))
+        throw std::runtime_error("BBox: voxelSize must be positive!");
+
+    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
+    grid->setTransform(voxelSize, origin);
+
+    // Define size of bbox with narrow-band in voxel units
+    const Vec3T r(width / (2 * ValueT(voxelSize)),
+                  height / (2 * ValueT(voxelSize)),
+                  depth / (2 * ValueT(voxelSize)));
+    const ValueT e = thickness / ValueT(voxelSize);
+
+    // Below the Nyquist frequency
+    if (r.min() < ValueT(1.5) || e < ValueT(1.5)) return grid;
+
+    // Define center of bbox in voxel units
+    const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
+                  ValueT(center[1] - origin[1]) / ValueT(voxelSize),
+                  ValueT(center[2] - origin[2]) / ValueT(voxelSize));
+
+    // Define utility functions
+    auto Pos = [](ValueT x) { return x > 0 ? x : 0; };
+    auto Neg = [](ValueT x) { return x < 0 ? x : 0; };
+
+    // Define bounds of the voxel coordinates
+    const math::BBox<Vec3T> b(c - r - Vec3T(e + ValueT(halfWidth)), c + r + Vec3T(e + ValueT(halfWidth)));
+    const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])),
+                         Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2])));
+    const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32);
+
+    // Compute signed distances to bbox using leapfrogging in k
+    auto kernel = [&](const util::Range<1,int> &ra) {
+        auto acc = grid->getWriteAccessor();
+        int m = 1;
+        for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) {
+            const ValueT px = math::Abs(ValueT(p[0]) - c[0]) - r[0];
+            const ValueT qx = math::Abs(ValueT(px) + e) - e;
+            const ValueT px2 = math::Pow2(Pos(px));
+            const ValueT qx2 = math::Pow2(Pos(qx));
+            for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) {
+                const ValueT py = math::Abs(ValueT(p[1]) - c[1]) - r[1];
+                const ValueT qy = math::Abs(ValueT(py) + e) - e;
+                const ValueT qy2 = math::Pow2(Pos(qy));
+                const ValueT px2qy2 = px2 + qy2;
+                const ValueT qx2py2 = qx2 + math::Pow2(Pos(py));
+                const ValueT qx2qy2 = qx2 + qy2;
+                const ValueT a[3] = {math::Max(px, qy), math::Max(qx, py), math::Max(qx, qy)};
+                for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) {
+                    m = 1;
+                    const ValueT pz = math::Abs(ValueT(p[2]) - c[2]) - r[2];
+                    const ValueT qz = math::Abs(ValueT(pz) + e) - e;
+                    const ValueT qz2 = math::Pow2(Pos(qz));
+                    const ValueT s1 = math::Sqrt(px2qy2 + qz2) + Neg(math::Max(a[0], qz));
+                    const ValueT s2 = math::Sqrt(qx2py2 + qz2) + Neg(math::Max(a[1], qz));
+                    const ValueT s3 = math::Sqrt(qx2qy2 + math::Pow2(Pos(pz))) + Neg(math::Max(a[2], pz));
+                    const ValueT v = math::Min(s1, math::Min(s2, s3)); // Distance in voxel units
+                    const ValueT d = math::Abs(v);
+                    if (d < halfWidth) { // inside narrow band
+                        acc.setValue(p, ValueT(voxelSize) * v); // distance in world units
+                    } else { // outside narrow band
+                        m += math::Floor(d - halfWidth); // leapfrog
+                    }
+                } //end leapfrog over k
+            } //end loop over j
+        } //end loop over i
+    }; //kernel
+#ifdef NANOVDB_PARALLEL_PRIMITIVES
+    util::forEach(range, kernel);
+#else
+    kernel(range);
+#endif
+
+    return grid;
+} // initBBox
+
+template <typename BuildT>
+std::shared_ptr<build::Grid<BuildT>>
+initOctahedron(double scale, // scale of the octahedron in world units
+               const Vec3d& center, // center of octahedron in world units
+               double voxelSize, // size of a voxel in world units
+               double halfWidth, // half-width of narrow band in voxel units
+               const Vec3d& origin) // origin of grid in world units
+{
+    using GridT = build::Grid<BuildT>;
+    using ValueT = typename BuildToValueMap<BuildT>::type;
+    using Vec3T = math::Vec3<ValueT>;
+    static_assert(util::is_floating_point<ValueT>::value, "initOctahedron: expect floating point");
+
+    if (!(scale > 0)) throw std::runtime_error("Octahedron: scale must be positive!");
+    if (!(voxelSize > 0)) throw std::runtime_error("Octahedron: voxelSize must be positive!");
+
+    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
+    grid->setTransform(voxelSize, origin);
+
+    // Define size of octahedron with narrow-band in voxel units
+    const ValueT s = scale / (2 * ValueT(voxelSize));
+
+    // Below the Nyquist frequency
+    if ( s < ValueT(1.5) ) return grid;
+
+    // Define center of octahedron in voxel units
+    const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
+                  ValueT(center[1] - origin[1]) / ValueT(voxelSize),
+                  ValueT(center[2] - origin[2]) / ValueT(voxelSize));
+
+    // Define utility functions
+    auto sdf = [&s](ValueT x, ValueT y, ValueT z) {
= ValueT(0.5)*(z - y + s); + if (d < ValueT(0)) { + return Vec3T(x, y - s, z).length(); + } else if (d > s) { + return Vec3T(x, y, z - s).length(); + } + return Vec3T(x, y - s + d, z - d).length(); + }; + + // Define bounds of the voxel coordinates + const math::BBox b(c - Vec3T(s + ValueT(halfWidth)), c + Vec3T(s + ValueT(halfWidth))); + const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])), + Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2]))); + const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); + + // Compute signed distances to octahedron using leapfrogging in k + auto kernel = [&](const util::Range<1,int> &ra) { + auto acc = grid->getWriteAccessor(); + int m = 1; + static const ValueT a = math::Sqrt(ValueT(1)/ValueT(3)); + for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { + const ValueT px = math::Abs(ValueT(p[0]) - c[0]); + for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { + const ValueT py = math::Abs(ValueT(p[1]) - c[1]); + for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { + m = 1; + const ValueT pz = math::Abs(ValueT(p[2]) - c[2]); + ValueT d = px + py + pz - s; + ValueT v; + if (ValueT(3)*px < d) { + v = sdf(px, py, pz); + } else if (ValueT(3)*py < d) { + v = sdf(py, pz, px); + } else if (ValueT(3)*pz < d) { + v = sdf(pz, px, py); + } else { + v = a * d; + } + d = math::Abs(v); + if (d < halfWidth) { // inside narrow band + acc.setValue(p, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + };// kernel +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + return grid; +} // initOctahedron + +} // unnamed namespace + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + 
bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance,// only used if VoxelT = FpN + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance,// only used if VoxelT = FpN + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = 
initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createPointSphere(int pointsPerVoxel, // number of points to be scattered in each active voxel + double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + auto sphereHandle = createLevelSetSphere(radius, center, voxelSize, 0.5, origin, "dummy", + StatsMode::BBox, CheckMode::Disable, buffer); + assert(sphereHandle); + auto* sphereGrid = sphereHandle.template grid(); + assert(sphereGrid); + auto pointHandle = createPointScatter(*sphereGrid, pointsPerVoxel, name, cMode, buffer); + assert(pointHandle); + return pointHandle; +} // createPointSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetTorus + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager 
mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeTorus + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createPointTorus(int pointsPerVoxel, // number of points to be scattered in each active voxel + double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + auto torusHandle = createLevelSetTorus(majorRadius, minorRadius, 
center, voxelSize, 0.5f, origin, + "dummy", StatsMode::BBox, CheckMode::Disable, buffer); + assert(torusHandle); + auto* torusGrid = torusHandle.template grid(); + assert(torusGrid); + auto pointHandle = createPointScatter(*torusGrid, pointsPerVoxel, name, cMode, buffer); + assert(pointHandle); + return pointHandle; +} // createPointTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale, // scale of the octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + 
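+    // The steps below are the conversion pipeline common to the createLevelSet*
+    // factories: wrap the build-tree in a NodeManager, flood-fill signs and
+    // renormalize the narrow band (sdfToLevelSet), then serialize to a NanoVDB
+    // grid. A hypothetical call site (argument values are illustrative and
+    // assume the defaulted trailing parameters of the declaration):
+    //   auto handle = createLevelSetOctahedron<float>(1.0, Vec3d(0.0), 0.1, 3.0);
+    //   const NanoGrid<float>* lsGrid = handle.grid<float>();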
build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetOctahedron + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale, // scale of the octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetOctahedron + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetBBox(double width, // width of bbox in world units + double height, // height of bbox in world units + double depth, // depth of bbox in world units + double thickness, // thickness of the wire in world units + const Vec3d& center, // center of bbox in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetBBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBBox(double width, // width of bbox in world units + double height, // height of bbox in world units + double depth, // depth of bbox in world units + double thickness, // thickness of the wire in world units + const Vec3d& center, // center of bbox in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = 
build::Grid; + auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetBBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeOctahedron(double scale, // scale of octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + 
CheckMode cMode, // mode of computation for the checksum
+                          const BufferT& buffer)
+{
+    using GridT = build::Grid<BuildT>;
+    auto grid = initOctahedron<BuildT>(scale, center, voxelSize, halfWidth, origin);
+    grid->mName = name;
+    build::NodeManager<GridT> mgr(*grid);
+    build::sdfToLevelSet(mgr);
+    build::levelSetToFog(mgr, false);
+    CreateNanoGrid<GridT> converter(*grid);
+    converter.setStats(sMode);
+    converter.setChecksum(cMode);
+    auto handle = converter.template getHandle<BuildT, BufferT>(buffer);
+    assert(handle);
+    return handle;
+} // createFogVolumeOctahedron
+
+//================================================================================================
+
+template
+typename util::enable_if::value, GridHandle>::type
+createFogVolumeOctahedron(double scale, // scale of octahedron in world units
+                          const Vec3d& center, // center of octahedron in world units
+                          double voxelSize, // size of a voxel in world units
+                          double halfWidth, // half-width of narrow band in voxel units
+                          const Vec3d& origin, // origin of grid in world units
+                          const std::string& name, // name of grid
+                          StatsMode sMode, // mode of computation for the statistics
+                          CheckMode cMode, // mode of computation for the checksum
+                          float tolerance,
+                          bool ditherOn,
+                          const BufferT& buffer)
+{
+    using GridT = build::Grid<BuildT>;
+    auto grid = initOctahedron<BuildT>(scale, center, voxelSize, halfWidth, origin);
+    grid->mName = name;
+    build::NodeManager<GridT> mgr(*grid);
+    build::sdfToLevelSet(mgr);
+    build::levelSetToFog(mgr, false);
+    CreateNanoGrid<GridT> converter(*grid);
+    converter.setStats(sMode);
+    converter.setChecksum(cMode);
+    converter.enableDithering(ditherOn);
+    AbsDiff oracle(tolerance);
+    auto handle = converter.template getHandle(oracle, buffer);
+    assert(handle);
+    return handle;
+} // createFogVolumeOctahedron
+
+//================================================================================================
+
+template
+typename util::disable_if::value, GridHandle>::type
+createPointBox(int pointsPerVoxel, // number of points to be scattered in each active voxel
+               double width, // width of box in world units
+               double height, // height of box in world units
+               double depth, // depth of box in world units
+               const Vec3d& center, // center of box in world units
+               double voxelSize, // size of a voxel in world units
+               const Vec3d& origin, // origin of grid in world units
+               const std::string& name, // name of grid
+               CheckMode cMode, // mode of computation for the checksum
+               const BufferT& buffer)
+{
+    auto boxHandle = createLevelSetBox(width, height, depth, center, voxelSize, 0.5, origin, "dummy",
+                                       StatsMode::BBox, CheckMode::Disable, buffer);
+    assert(boxHandle);
+    auto* boxGrid = boxHandle.template grid<BuildT>();
+    assert(boxGrid);
+    auto pointHandle = createPointScatter(*boxGrid, pointsPerVoxel, name, cMode, buffer);
+    assert(pointHandle);
+    return pointHandle;
+} // createPointBox
+
+//================================================================================================
+
+template<typename SrcBuildT, typename BufferT>
+inline GridHandle<BufferT>
+createPointScatter(const NanoGrid<SrcBuildT>& srcGrid, // source grid whose active voxels are populated with points
+                   int pointsPerVoxel, // number of points to be scattered in each active voxel
+                   const std::string& name, // name of grid
+                   CheckMode cMode, // mode of computation for the checksum
+                   const BufferT& buffer)
+{
+    using ValueT = typename BuildToValueMap<SrcBuildT>::type;
+    static_assert(util::is_floating_point<ValueT>::value, "createPointScatter: expect floating point");
+    using Vec3T = math::Vec3<ValueT>;
+    if (pointsPerVoxel < 1) {
+        throw std::runtime_error("createPointScatter: Expected at least one point per voxel");
+    }
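+    // Scattering requires a level set whose statistics were baked in: the
+    // bounding-box flag and the active voxel count below determine how many
+    // points are generated and how much blind data is allocated.
+    if 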
(!srcGrid.isLevelSet()) {
+        throw std::runtime_error("createPointScatter: Expected a level set grid");
+    }
+    if (!srcGrid.hasBBox()) {
+        throw std::runtime_error("createPointScatter: ActiveVoxelCount is required");
+    }
+    const uint64_t pointCount = pointsPerVoxel * srcGrid.activeVoxelCount();
+    if (pointCount == 0) {
+        throw std::runtime_error("createPointScatter: No particles to scatter");
+    }
+    std::vector<Vec3T> xyz;
+    xyz.reserve(pointCount);
+    using DstGridT = build::Grid<uint32_t>;
+    DstGridT dstGrid(std::numeric_limits<uint32_t>::max(), name, GridClass::PointData);
+    dstGrid.mMap = srcGrid.map();
+    auto dstAcc = dstGrid.getAccessor();
+    std::srand(1234);
+    const ValueT s = 1 / (1 + ValueT(RAND_MAX)); // scale so s*rand() is in ] 0, 1 [
+    // return a point with random local voxel coordinates (-0.5 to +0.5)
+    auto randomPoint = [&s](){return s * Vec3T(rand(), rand(), rand()) - Vec3T(0.5);};
+    const auto& srcTree = srcGrid.tree();
+    auto srcMgrHandle = createNodeManager(srcGrid);
+    auto *srcMgr = srcMgrHandle.template mgr<SrcBuildT>();
+    assert(srcMgr);
+    for (uint32_t i = 0, end = srcTree.nodeCount(0); i < end; ++i) {
+        auto& srcLeaf = srcMgr->leaf(i);
+        auto* dstLeaf = dstAcc.setValue(srcLeaf.origin(), pointsPerVoxel); // allocates leaf node
+        dstLeaf->mValueMask = srcLeaf.valueMask();
+        for (uint32_t j = 0, m = 0; j < 512; ++j) {
+            if (dstLeaf->mValueMask.isOn(j)) {
+                const Vec3f ijk = dstLeaf->offsetToGlobalCoord(j).asVec3s();// floating-point representation of index coordinates
+                for (int n = 0; n < pointsPerVoxel; ++n) xyz.push_back(srcGrid.indexToWorld(randomPoint() + ijk));
+                m += pointsPerVoxel;
+            }// active voxels
+            dstLeaf->mValues[j] = m;
+        }// loop over all voxels
+    }// loop over leaf nodes
+    assert(pointCount == xyz.size());
+    CreateNanoGrid<DstGridT> converter(dstGrid);
+    converter.setStats(StatsMode::MinMax);
+    converter.setChecksum(CheckMode::Disable);
+
+    converter.addBlindData(name,
+                           GridBlindDataSemantic::WorldCoords,
+                           GridBlindDataClass::AttributeArray,
+                           toGridType<Vec3T>(),
+                           pointCount,
+                           sizeof(Vec3T));
+    auto handle = converter.template getHandle<uint32_t, BufferT>(buffer);
+    assert(handle);
+
+    auto* grid = handle.template grid<uint32_t>();
+    assert(grid && grid->template isSequential<0>());
+    auto &tree = grid->tree();
+    if (tree.nodeCount(0) == 0) throw std::runtime_error("Expect leaf nodes!");
+    auto *leafData = tree.getFirstLeaf()->data();
+    leafData[0].mMinimum = 0; // start of prefix sum
+    for (uint32_t i = 1, n = tree.nodeCount(0); i < n; ++i) {
+        leafData[i].mMinimum = leafData[i - 1].mMinimum + leafData[i - 1].mMaximum;
+    }
+    if (Vec3T *blindData = grid->template getBlindData<Vec3T>(0)) {
+        memcpy(blindData, xyz.data(), xyz.size() * sizeof(Vec3T));
+    } else {
+        throw std::runtime_error("Blind data pointer was NULL");
+    }
+    updateChecksum(grid, cMode);
+    return handle;
+} // createPointScatter
+
+}// namespace tools
+
+} // namespace nanovdb
+
+#endif // NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/GridBuilder.h b/nanovdb/nanovdb/tools/GridBuilder.h
new file mode 100644
index 0000000000..3072a59348
--- /dev/null
+++ b/nanovdb/nanovdb/tools/GridBuilder.h
@@ -0,0 +1,2315 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/GridBuilder.h
+
+    \author Ken Museth
+
+    \date June 26, 2020
+
+    \brief This file defines a minimum set of tree nodes and tools that
+           can be used (instead of OpenVDB) to build nanovdb grids on the CPU.
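+
+    \note Typical usage (a sketch only; it assumes the default HostBuffer and
+          the CreateNanoGrid converter declared in a separate header):
+    \code
+    build::Grid<float> grid(0.0f); // grid with background value 0
+    auto acc = grid.getWriteAccessor();
+    acc.setValue(Coord(1, 2, 3), 1.0f);
+    auto handle = CreateNanoGrid<build::Grid<float>>(grid).getHandle<float>();
+    \endcode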
+*/
+
+#ifndef NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED
+
+#include
+
+#include
+#include
+#include <sstream> // for stringstream
+#include
+#include <cstring> // for memcpy
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace nanovdb {
+
+namespace tools::build {
+
+// ----------------------------> Forward declarations of random access methods <--------------------------------------
+
+template <typename T> struct GetValue;
+template <typename T> struct SetValue;
+template <typename T> struct TouchLeaf;
+template <typename T> struct GetState;
+template <typename T> struct ProbeValue;
+
+// ----------------------------> RootNode <--------------------------------------
+
+template<typename ChildT>
+struct RootNode
+{
+    using ValueType = typename ChildT::ValueType;
+    using BuildType = typename ChildT::BuildType;
+    using ChildNodeType = ChildT;
+    using LeafNodeType = typename ChildT::LeafNodeType;
+    static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf
+    struct Tile {
+        Tile(ChildT* c = nullptr) : child(c) {}
+        Tile(const ValueType& v, bool s) : child(nullptr), value(v), state(s) {}
+        bool isChild() const { return child!=nullptr; }
+        bool isValue() const { return child==nullptr; }
+        bool isActive() const { return child==nullptr && state; }
+        ChildT* child;
+        ValueType value;
+        bool state;
+    };
+    using MapT = std::map<Coord, Tile>;
+    MapT mTable;
+    ValueType mBackground;
+
+    Tile* probeTile(const Coord &ijk) {
+        auto iter = mTable.find(CoordToKey(ijk));
+        return iter == mTable.end() ? nullptr : &(iter->second);
+    }
+
+    const Tile* probeTile(const Coord &ijk) const {
+        auto iter = mTable.find(CoordToKey(ijk));
+        return iter == mTable.end() ? nullptr : &(iter->second);
+    }
+
+    class ChildIterator
+    {
+        const RootNode *mParent;
+        typename MapT::const_iterator mIter;
+    public:
+        ChildIterator() : mParent(nullptr), mIter() {}
+        ChildIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
+            while (mIter!=parent->mTable.end() && mIter->second.child==nullptr) ++mIter;
+        }
+        ChildIterator& operator=(const ChildIterator&) = default;
+        ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mIter->second.child;}
+        ChildT* operator->() const {NANOVDB_ASSERT(*this); return mIter->second.child;}
+        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
+        ChildIterator& operator++() {
+            NANOVDB_ASSERT(mParent);
+            ++mIter;
+            while (mIter!=mParent->mTable.end() && mIter->second.child==nullptr) ++mIter;
+            return *this;
+        }
+        ChildIterator operator++(int) {
+            auto tmp = *this;
+            ++(*this);
+            return tmp;
+        }
+        uint32_t pos() const {
+            NANOVDB_ASSERT(mParent);
+            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
+        }
+    }; // Member class ChildIterator
+
+    ChildIterator cbeginChild() const {return ChildIterator(this);}
+    ChildIterator cbeginChildOn() const {return ChildIterator(this);}// match openvdb
+
+    class ValueIterator
+    {
+        const RootNode *mParent;
+        typename MapT::const_iterator mIter;
+    public:
+        ValueIterator() : mParent(nullptr), mIter() {}
+        ValueIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
+            while (mIter!=parent->mTable.end() && mIter->second.child!=nullptr) ++mIter;
+        }
+        ValueIterator& operator=(const ValueIterator&) = default;
+        ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;}
+        bool isActive() const {NANOVDB_ASSERT(*this); return mIter->second.state;}
+        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
+        ValueIterator& operator++() {
+            NANOVDB_ASSERT(mParent);
+            ++mIter;
+            while (mIter!=mParent->mTable.end() && mIter->second.child!=nullptr) ++mIter;
+            return *this;
+        }
+        ValueIterator operator++(int) {
+            auto tmp = *this;
+            ++(*this);
+            return tmp;
+        }
+        uint32_t pos() const {
+            NANOVDB_ASSERT(mParent);
+            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
+        }
+    }; // Member class ValueIterator
+
+    ValueIterator beginValue() {return ValueIterator(this);}
+    ValueIterator cbeginValueAll() const {return ValueIterator(this);}
+
+    class ValueOnIterator
+    {
+        const RootNode *mParent;
+        typename MapT::const_iterator mIter;
+    public:
+        ValueOnIterator() : mParent(nullptr), mIter() {}
+        ValueOnIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
+            while (mIter!=parent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter;
+        }
+        ValueOnIterator& operator=(const ValueOnIterator&) = default;
+        ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;}
+        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
+        ValueOnIterator& operator++() {
+            NANOVDB_ASSERT(mParent);
+            ++mIter;
+            while (mIter!=mParent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter;
+            return *this;
+        }
+        ValueOnIterator operator++(int) {
+            auto tmp = *this;
+            ++(*this);
+            return tmp;
+        }
+        uint32_t pos() const {
+            NANOVDB_ASSERT(mParent);
+            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
+        }
+    }; // Member class ValueOnIterator
+
+    ValueOnIterator beginValueOn() {return ValueOnIterator(this);}
+    ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);}
+
+    class TileIterator
+    {
+        const RootNode *mParent;
+        typename MapT::const_iterator mIter;
+    public:
+        TileIterator() : mParent(nullptr), mIter() {}
+        TileIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
+            NANOVDB_ASSERT(mParent);
+        }
+        TileIterator& operator=(const TileIterator&) = default;
+        const Tile& operator*() const {NANOVDB_ASSERT(*this); return mIter->second;}
+        const Tile* operator->() const {NANOVDB_ASSERT(*this); return &(mIter->second);}
+        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;}
+        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
+        const ChildT* probeChild(ValueType &value) {
+            NANOVDB_ASSERT(*this);
+            const ChildT *child = mIter->second.child;
+            if (child==nullptr) value = mIter->second.value;
+            return child;
+        }
+        bool isValueOn() const {return mIter->second.child==nullptr && mIter->second.state;}
+        TileIterator& operator++() {
+            NANOVDB_ASSERT(mParent);
+            ++mIter;
+            return *this;
+        }
+        TileIterator operator++(int) {
+            auto tmp = *this;
+            ++(*this);
+            return tmp;
+        }
+        uint32_t pos() const {
+            NANOVDB_ASSERT(mParent);
+            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
+        }
+    }; // Member class TileIterator
+
+    TileIterator beginTile() {return TileIterator(this);}
+    TileIterator cbeginChildAll() const {return TileIterator(this);}
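+
+    // Note that RootNode mirrors a subset of OpenVDB's root-node iterator API
+    // (cbeginChildOn, cbeginValueAll, cbeginChildAll, ...) so generic traversal
+    // code can run on either tree; the DenseIterator below is sketched but
+    // intentionally left unimplemented.
+    //class 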
DenseIterator : public TileIterator + + RootNode(const ValueType& background) : mBackground(background) {} + RootNode(const RootNode&) = delete; // disallow copy-construction + RootNode(RootNode&&) = default; // allow move construction + RootNode& operator=(const RootNode&) = delete; // disallow copy assignment + RootNode& operator=(RootNode&&) = default; // allow move assignment + + ~RootNode() { this->clear(); } + + uint32_t tileCount() const { return uint32_t(mTable.size()); } + uint32_t getTableSize() const { return uint32_t(mTable.size()); }// match openvdb + const ValueType& background() const {return mBackground;} + + void nodeCount(std::array &count) const + { + for (auto it = this->cbeginChild(); it; ++it) { + count[ChildT::LEVEL] += 1; + it->nodeCount(count); + } + } + + bool empty() const { return mTable.empty(); } + + void clear() + { + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) delete iter->second.child; + mTable.clear(); + } + + static Coord CoordToKey(const Coord& ijk) { return ijk & ~ChildT::MASK; } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + template + auto get(const Coord& ijk, ArgsT&&... args) const + { + if (const Tile *tile = this->probeTile(ijk)) { + if (auto *child = tile->child) return child->template get(ijk, args...); + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + template + auto set(const Coord& ijk, ArgsT&&... args) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + return child->template set(ijk, args...); + } + template + auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const + { + if (const Tile *tile = this->probeTile(ijk)) { + if (auto *child = tile->child) { + acc.insert(ijk, child); + return child->template get(ijk, args...); + } + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} + ValueType getValue(int i, int j, int k) const {return this->template get>(Coord(i,j,k));} + ValueType operator()(const Coord& ijk) const {return this->template get>(ijk);} + ValueType operator()(int i, int j, int k) const {return this->template get>(Coord(i,j,k));} + void setValue(const Coord& ijk, const ValueType& value) {this->template set>(ijk, value);} + bool probeValue(const Coord& ijk, ValueType& value) const {return this->template get>(ijk, value);} + bool isActive(const Coord& ijk) const {return this->template get>(ijk);} +#else + ValueType getValue(const Coord& ijk) const + { +#if 1 + if (auto *tile = this->probeTile(ijk)) return tile->child ? 
tile->child->getValue(ijk) : tile->value; + return mBackground; +#else + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) { + return mBackground; + } else if (iter->second.child) { + return iter->second.child->getValue(ijk); + } else { + return iter->second.value; + } +#endif + } + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + + void setValue(const Coord& ijk, const ValueType& value) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + child->setValue(ijk, value); + } + + template + bool isActiveAndCache(const Coord& ijk, AccT& acc) const + { + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) + return false; + if (iter->second.child) { + acc.insert(ijk, iter->second.child); + return iter->second.child->isActiveAndCache(ijk, acc); + } + return iter->second.state; + } + + template + ValueType getValueAndCache(const Coord& ijk, AccT& acc) const + { + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) + return mBackground; + if (iter->second.child) { + acc.insert(ijk, iter->second.child); + return iter->second.child->getValueAndCache(ijk, acc); + } + return iter->second.value; + } + + template + void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + child->setValueAndCache(ijk, value, acc); + } + template + void setValueOnAndCache(const Coord& ijk, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + child->setValueOnAndCache(ijk, acc); + } + template + void touchLeafAndCache(const Coord &ijk, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + acc.insert(ijk, child); + child->touchLeafAndCache(ijk, acc); + } +#endif// NANOVDB_NEW_ACCESSOR_METHODS + + template + uint32_t nodeCount() const + { + static_assert(util::is_same::value, "Root::getNodes: Invalid type"); + static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error"); + uint32_t sum = 0; + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if 
(iter->second.child == nullptr) continue; // skip tiles + if constexpr(util::is_same::value) { //resolved at compile-time + ++sum; + } else { + sum += iter->second.child->template nodeCount(); + } + } + return sum; + } + + template + void getNodes(std::vector& array) + { + static_assert(util::is_same::value, "Root::getNodes: Invalid type"); + static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error"); + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if (iter->second.child == nullptr) + continue; + if constexpr(util::is_same::value) { //resolved at compile-time + array.push_back(reinterpret_cast(iter->second.child)); + } else { + iter->second.child->getNodes(array); + } + } + } + + void addChild(ChildT*& child) + { + NANOVDB_ASSERT(child); + const Coord key = CoordToKey(child->mOrigin); + auto iter = mTable.find(key); + if (iter != mTable.end() && iter->second.child != nullptr) { // existing child node + delete iter->second.child; + iter->second.child = child; + } else { + mTable[key] = Tile(child); + } + child = nullptr; + } + + /// @brief Add a tile containing voxel (i, j, k) at the specified tree level, + /// creating a new branch if necessary. Delete any existing lower-level nodes + /// that contain (x, y, z). + /// @tparam level tree level at which the tile is inserted. Must be 1, 2 or 3. + /// @param ijk Index coordinate that map to the tile being inserted + /// @param value Value of the tile + /// @param state Binary state of the tile + template + void addTile(const Coord& ijk, const ValueType& value, bool state) + { + static_assert(level > 0 && level <= LEVEL, "invalid template value of level"); + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if constexpr(level == LEVEL) { + if (iter == mTable.end()) { + mTable[key] = Tile(value, state); + } else if (iter->second.child == nullptr) { + iter->second.value = value; + iter->second.state = state; + } else { + delete iter->second.child; + iter->second.child = nullptr; + iter->second.value = value; + iter->second.state = state; + } + } else if constexpr(level < LEVEL) { + ChildT* child = nullptr; + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + child->template addTile(ijk, value, state); + } + } + + template + void addNode(NodeT*& node) + { + if constexpr(util::is_same::value) { //resolved at compile-time + this->addChild(reinterpret_cast(node)); + } else { + ChildT* child = nullptr; + const Coord key = CoordToKey(node->mOrigin); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(node->mOrigin, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(node->mOrigin, iter->second.value, iter->second.state); + iter->second.child = child; + } + child->addNode(node); + } + } + + void merge(RootNode &other) + { + for (auto iter1 = other.mTable.begin(); iter1 != other.mTable.end(); ++iter1) { + if (iter1->second.child == nullptr) continue;// ignore input tiles + auto iter2 = mTable.find(iter1->first); + if (iter2 == mTable.end() || iter2->second.child == nullptr) { + mTable[iter1->first] = Tile(iter1->second.child); + iter1->second.child = nullptr; + } else { + iter2->second.child->merge(*iter1->second.child); + } + } + 
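+        // Merging is destructive: children of the other tree have either been
+        // re-parented into this tree or recursively drained above, and its
+        // value tiles were deliberately ignored, so clearing now deletes any
+        // node shells it still owns and empties its table.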
other.clear(); + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::RootNode + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +RootNode::signedFloodFill(T outside) +{ + std::map nodeKeys; + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if (iter->second.child == nullptr) + continue; + nodeKeys.insert(std::pair(iter->first, iter->second.child)); + } + + // We employ a simple z-scanline algorithm that inserts inactive tiles with + // the inside value if they are sandwiched between inside child nodes only! + auto b = nodeKeys.begin(), e = nodeKeys.end(); + if (b == e) + return; + for (auto a = b++; b != e; ++a, ++b) { + Coord d = b->first - a->first; // delta of neighboring coordinates + if (d[0] != 0 || d[1] != 0 || d[2] == int(ChildT::DIM)) + continue; // not same z-scanline or neighbors + const ValueType fill[] = {a->second->getLastValue(), b->second->getFirstValue()}; + if (!(fill[0] < 0) || !(fill[1] < 0)) + continue; // scanline isn't inside + Coord c = a->first + Coord(0u, 0u, ChildT::DIM); + for (; c[2] != b->first[2]; c[2] += ChildT::DIM) { + const Coord key = RootNode::CoordToKey(c); + mTable[key] = typename RootNode::Tile(-outside, false); // inactive tile + } + } +} // tools::build::RootNode::signedFloodFill + +// ----------------------------> InternalNode <-------------------------------------- + +template +struct InternalNode +{ + using ValueType = typename ChildT::ValueType; + using BuildType = typename ChildT::BuildType; + using ChildNodeType = ChildT; + using LeafNodeType = typename ChildT::LeafNodeType; + static constexpr uint32_t LOG2DIM = ChildT::LOG2DIM + 1; + static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; //dimension in index space + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); //number of tile values (or child pointers) + static constexpr uint32_t MASK = DIM - 1; + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using MaskT = Mask; + template + using MaskIterT = typename MaskT::template Iterator; + using NanoNodeT = typename NanoNode::Type; + + struct Tile { + Tile(ChildT* c = nullptr) : child(c) {} + Tile(const ValueType& v) : value(v) {} + union{ + ChildT* child; + ValueType value; + }; + }; + Coord mOrigin; + MaskT mValueMask; + MaskT mChildMask; + Tile mTable[SIZE]; + + union { + NanoNodeT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits child nodes of this node only + class ChildIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ChildIterator() : BaseT(), mParent(nullptr) {} + ChildIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOn()), mParent(parent) {} + ChildIterator& operator=(const ChildIterator&) = default; + const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->mTable[BaseT::pos()].child;} + const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].child;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return (*this)->origin();} + }; // Member class ChildIterator + + ChildIterator beginChild() {return ChildIterator(this);} + ChildIterator cbeginChildOn() const {return ChildIterator(this);}// match openvdb + + /// 
@brief Visits all tile values in this node, i.e. both inactive and active tiles + class ValueIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ValueIterator() : BaseT(), mParent(nullptr) {} + ValueIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOff()), mParent(parent) {} + ValueIterator& operator=(const ValueIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(BaseT::pos());} + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + /// @brief Visits active tile values of this node only + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const InternalNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all tile values and child nodes of this node + class DenseIterator : public MaskT::DenseIterator + { + using BaseT = typename MaskT::DenseIterator; + const InternalNode *mParent; + public: + DenseIterator() : BaseT(), mParent(nullptr) {} + DenseIterator(const InternalNode* parent) : BaseT(0), mParent(parent) {} + DenseIterator& operator=(const DenseIterator&) = default; + ChildT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(mParent && bool(*this)); + ChildT *child = nullptr; + if (mParent->mChildMask.isOn(BaseT::pos())) { + child = mParent->mTable[BaseT::pos()].child; + } else { + value = mParent->mTable[BaseT::pos()].value; + } + return child; + } + Coord getCoord() const { NANOVDB_ASSERT(mParent && bool(*this)); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class DenseIterator + + DenseIterator beginDense() {return DenseIterator(this);} + DenseIterator cbeginChildAll() const {return DenseIterator(this);}// matches openvdb + + InternalNode(const Coord& origin, const ValueType& value, bool state) + : mOrigin(origin & ~MASK) + , mValueMask(state) + , mChildMask() + , mDstOffset(0) + { + for (uint32_t i = 0; i < SIZE; ++i) mTable[i].value = value; + } + InternalNode(const InternalNode&) = delete; // disallow copy-construction + InternalNode(InternalNode&&) = delete; // disallow move construction + InternalNode& operator=(const InternalNode&) = delete; // disallow copy assignment + InternalNode& operator=(InternalNode&&) = delete; // disallow move assignment + ~InternalNode() + { + for (auto iter = mChildMask.beginOn(); iter; ++iter) { + delete mTable[*iter].child; + } + } + const MaskT& getValueMask() const {return mValueMask;} + const MaskT& valueMask() const {return mValueMask;} + const MaskT& getChildMask() const {return mChildMask;} + const MaskT& childMask() const {return mChildMask;} + const Coord& origin() const {return 
mOrigin;}
+
+    void nodeCount(std::array<size_t, 3> &count) const
+    {
+        count[ChildT::LEVEL] += mChildMask.countOn();
+        if constexpr(ChildT::LEVEL>0) {
+            for (auto it = const_cast<InternalNode*>(this)->beginChild(); it; ++it) it->nodeCount(count);
+        }
+    }
+
+    static uint32_t CoordToOffset(const Coord& ijk)
+    {
+        return (((ijk[0] & int32_t(MASK)) >> ChildT::TOTAL) << (2 * LOG2DIM)) +
+               (((ijk[1] & int32_t(MASK)) >> ChildT::TOTAL) << (LOG2DIM)) +
+                ((ijk[2] & int32_t(MASK)) >> ChildT::TOTAL);
+    }
+
+    static Coord OffsetToLocalCoord(uint32_t n)
+    {
+        NANOVDB_ASSERT(n < SIZE);
+        const uint32_t m = n & ((1 << 2 * LOG2DIM) - 1);
+        return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & ((1 << LOG2DIM) - 1));
+    }
+
+    void localToGlobalCoord(Coord& ijk) const
+    {
+        ijk <<= ChildT::TOTAL;
+        ijk += mOrigin;
+    }
+
+    Coord offsetToGlobalCoord(uint32_t n) const
+    {
+        Coord ijk = InternalNode::OffsetToLocalCoord(n);
+        this->localToGlobalCoord(ijk);
+        return ijk;
+    }
+
+    ValueType getFirstValue() const { return mChildMask.isOn(0) ? mTable[0].child->getFirstValue() : mTable[0].value; }
+    ValueType getLastValue() const { return mChildMask.isOn(SIZE - 1) ? mTable[SIZE - 1].child->getLastValue() : mTable[SIZE - 1].value; }
+
+    template <typename OpT, typename... ArgsT>
+    auto get(const Coord& ijk, ArgsT&&... args) const
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        if (mChildMask.isOn(n)) return mTable[n].child->template get<OpT>(ijk, args...);
+        return OpT::get(*this, n, args...);
+    }
+
+    template <typename OpT, typename... ArgsT>
+    auto set(const Coord& ijk, ArgsT&&... args)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        NANOVDB_ASSERT(child);
+        return child->template set<OpT>(ijk, args...);
+    }
+
+    template <typename OpT, typename AccT, typename... ArgsT>
+    auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        if (mChildMask.isOff(n)) return OpT::get(*this, n, args...);
+        ChildT* child = mTable[n].child;
+        acc.insert(ijk, child);
+        if constexpr(ChildT::LEVEL == 0) {
+            return child->template get<OpT>(ijk, args...);
+        } else {
+            return child->template getAndCache<OpT>(ijk, acc, args...);
+        }
+    }
+
+    template <typename OpT, typename AccT, typename... ArgsT>
+    auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        NANOVDB_ASSERT(child);
+        acc.insert(ijk, child);
+        if constexpr(ChildT::LEVEL == 0) {
+            return child->template set<OpT>(ijk, args...);
+        } else {
+            return child->template setAndCache<OpT>(ijk, acc, args...);
+        }
+    }
+
+#ifdef NANOVDB_NEW_ACCESSOR_METHODS
+    ValueType getValue(const Coord& ijk) const {return this->template get<GetValue<BuildType>>(ijk);}
+    LeafNodeType& setValue(const Coord& ijk, const ValueType& value){return *this->template set<SetValue<BuildType>>(ijk, value);}
+#else
+    ValueType getValue(const Coord& ijk) const
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        if (mChildMask.isOn(n)) {
+            return mTable[n].child->getValue(ijk);
+        }
+        return mTable[n].value;
+    }
+    void setValue(const Coord& ijk, const ValueType& value)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        child->setValue(ijk, value);
+    }
+
+    template <typename AccT>
+    ValueType getValueAndCache(const Coord& ijk, AccT& acc) const
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        if (mChildMask.isOn(n)) {
+            acc.insert(ijk, const_cast<ChildT*>(mTable[n].child));
+            return mTable[n].child->getValueAndCache(ijk, acc);
+        }
+        return mTable[n].value;
+    }
+
+    template <typename AccT>
+    void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        acc.insert(ijk, child);
+        child->setValueAndCache(ijk, value, acc);
+    }
+
+    template <typename AccT>
+    void setValueOnAndCache(const Coord& ijk, AccT& acc)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        acc.insert(ijk, child);
+        child->setValueOnAndCache(ijk, acc);
+    }
+
+    template <typename AccT>
+    void touchLeafAndCache(const Coord &ijk, AccT& acc)
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        ChildT* child = nullptr;
+        if (mChildMask.isOn(n)) {
+            child = mTable[n].child;
+        } else {
+            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
+            mTable[n].child = child;
+            mChildMask.setOn(n);
+        }
+        acc.insert(ijk, child);
+        if constexpr(LEVEL>1) child->touchLeafAndCache(ijk, acc);
+    }
+    template <typename AccT>
+    bool isActiveAndCache(const Coord& ijk, AccT& acc) const
+    {
+        const uint32_t n = CoordToOffset(ijk);
+        if (mChildMask.isOn(n)) {
+            acc.insert(ijk, const_cast<ChildT*>(mTable[n].child));
+            return mTable[n].child->isActiveAndCache(ijk, acc);
+        }
+        return mValueMask.isOn(n);
+    }
+#endif
+
+    template <typename NodeT>
+    uint32_t nodeCount() const
+    {
+        static_assert(util::is_same<ValueType, typename NodeT::ValueType>::value, "Node::getNodes: Invalid type");
+        NANOVDB_ASSERT(NodeT::LEVEL < LEVEL);
+        uint32_t sum = 0;
+        if constexpr(util::is_same<NodeT, ChildT>::value) { // resolved at compile-time
+            sum += mChildMask.countOn();
+        } else if constexpr(LEVEL>1) {
+            for (auto iter = mChildMask.beginOn(); iter; ++iter) {
+                sum += mTable[*iter].child->template nodeCount<NodeT>();
+            }
+        }
+        return sum;
+    }
+
+    template <typename NodeT>
+    void getNodes(std::vector<NodeT*>& array)
+    {
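+        // NodeT must store the same ValueType as this tree (enforced by the
+        // static_assert below); the if-constexpr dispatch then either collects
+        // the immediate children (when NodeT equals ChildT, resolved at
+        // compile time) or recurses one level further down the tree.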
static_assert(util::is_same::value, "Node::getNodes: Invalid type"); + NANOVDB_ASSERT(NodeT::LEVEL < LEVEL); + for (auto iter = mChildMask.beginOn(); iter; ++iter) { + if constexpr(util::is_same::value) { // resolved at compile-time + array.push_back(reinterpret_cast(mTable[*iter].child)); + } else if constexpr(LEVEL>1) { + mTable[*iter].child->getNodes(array); + } + } + } + + void addChild(ChildT*& child) + { + NANOVDB_ASSERT(child && (child->mOrigin & ~MASK) == this->mOrigin); + const uint32_t n = CoordToOffset(child->mOrigin); + if (mChildMask.isOn(n)) { + delete mTable[n].child; + } else { + mChildMask.setOn(n); + } + mTable[n].child = child; + child = nullptr; + } + + /// @brief Add a tile containing voxel (i, j, k) at the specified tree level, + /// creating a new branch if necessary. Delete any existing lower-level nodes + /// that contain (x, y, z). + /// @tparam level tree level at which the tile is inserted. Must be 1 or 2. + /// @param ijk Index coordinate that map to the tile being inserted + /// @param value Value of the tile + /// @param state Binary state of the tile + template + void addTile(const Coord& ijk, const ValueType& value, bool state) + { + static_assert(level > 0 && level <= LEVEL, "invalid template value of level"); + const uint32_t n = CoordToOffset(ijk); + if constexpr(level == LEVEL) { + if (mChildMask.isOn(n)) { + delete mTable[n].child; + mTable[n] = Tile(value); + } else { + mValueMask.set(n, state); + mTable[n].value = value; + } + } else if constexpr(level < LEVEL) { + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, value, state); + mTable[n].child = child; + mChildMask.setOn(n); + } + child->template addTile(ijk, value, state); + } + } + + template + void addNode(NodeT*& node) + { + if constexpr(util::is_same::value) { //resolved at compile-time + this->addChild(reinterpret_cast(node)); + } else if constexpr(LEVEL>1) { + const uint32_t n = CoordToOffset(node->mOrigin); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(node->mOrigin, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + child->addNode(node); + } + } + + void merge(InternalNode &other) + { + for (auto iter = other.mChildMask.beginOn(); iter; ++iter) { + const uint32_t n = *iter; + if (mChildMask.isOn(n)) { + mTable[n].child->merge(*other.mTable[n].child); + } else { + mTable[n].child = other.mTable[n].child; + other.mChildMask.setOff(n); + mChildMask.setOn(n); + } + } + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::InternalNode + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +InternalNode::signedFloodFill(T outside) +{ + const uint32_t first = *mChildMask.beginOn(); + if (first < NUM_VALUES) { + bool xInside = mTable[first].child->getFirstValue() < 0; + bool yInside = xInside, zInside = xInside; + for (uint32_t x = 0; x != (1 << LOG2DIM); ++x) { + const uint32_t x00 = x << (2 * LOG2DIM); // offset for block(x, 0, 0) + if (mChildMask.isOn(x00)) { + xInside = mTable[x00].child->getLastValue() < 0; + } + yInside = xInside; + for (uint32_t y = 0; y != (1u << LOG2DIM); ++y) { + const uint32_t xy0 = x00 + (y << LOG2DIM); // offset for block(x, y, 0) + if (mChildMask.isOn(xy0)) + yInside = mTable[xy0].child->getLastValue() < 0; + zInside = 
yInside; + for (uint32_t z = 0; z != (1 << LOG2DIM); ++z) { + const uint32_t xyz = xy0 + z; // offset for block(x, y, z) + if (mChildMask.isOn(xyz)) { + zInside = mTable[xyz].child->getLastValue() < 0; + } else { + mTable[xyz].value = zInside ? -outside : outside; + } + } + } + } + } +} // tools::build::InternalNode::signedFloodFill + +// ----------------------------> LeafNode <-------------------------------------- + +template +struct LeafNode +{ + using BuildType = BuildT; + using ValueType = typename BuildToValueMap::type; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask; + ValueType mValues[SIZE]; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + ValueType operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues[mPos];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->isActive(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, const ValueType& value, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) //invalid + , mDstOffset(0) + { + ValueType* target = mValues; + uint32_t n = SIZE; + while (n--) { + *target++ = value; + } + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& getValueMask() const {return mValueMask;} + const Mask& valueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const + { + ijk += mOrigin; + } + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + ValueType getFirstValue() const { return mValues[0]; } + ValueType getLastValue() const { return mValues[SIZE - 1]; } + const ValueType& getValue(uint32_t i) const {return mValues[i];} + const ValueType& getValue(const Coord& ijk) const {return mValues[CoordToOffset(ijk)];} + + template + auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} + + template + auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + const ValueType& getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValues[CoordToOffset(ijk)]; + } + + template + void setValueAndCache(const Coord& ijk, const ValueType& value, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + mValues[n] = value; + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } +#endif + + void setValue(uint32_t n, const ValueType& value) + { + mValueMask.setOn(n); + mValues[n] = value; + } + void setValue(const Coord& ijk, const ValueType& value){this->setValue(CoordToOffset(ijk), value);} + + void merge(LeafNode &other) + { + other.mValueMask -= mValueMask; + for (auto iter = other.mValueMask.beginOn(); iter; ++iter) { + const uint32_t n = *iter; + mValues[n] = other.mValues[n]; + } + mValueMask |= other.mValueMask; + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::LeafNode + +//================================================================================================ + +template <> +struct LeafNode +{ + using ValueType = bool; + using BuildType = ValueMask; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return true;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return false;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + 
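+    // In this ValueMask specialization the active state *is* the value, so the
+    // iterators above dereference to the constants true (for on values) and
+    // false (for off values), and the only per-voxel storage is the 8^3-bit
+    // mValueMask.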
ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, const ValueType&, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) //invalid + , mDstOffset(0) + { + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& valueMask() const {return mValueMask;} + const Mask& getValueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const {ijk += mOrigin;} + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + bool getFirstValue() const { return mValueMask.isOn(0); } + bool getLastValue() const { return mValueMask.isOn(SIZE - 1); } + bool getValue(uint32_t i) const {return mValueMask.isOn(i);} + bool getValue(const Coord& ijk) const {return mValueMask.isOn(CoordToOffset(ijk));} + + template + auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} + + template + auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + bool getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } + + template + void setValueAndCache(const Coord& ijk, bool, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } +#endif + + void setValue(uint32_t n, bool) {mValueMask.setOn(n);} + void setValue(const Coord& ijk) {mValueMask.setOn(CoordToOffset(ijk));} + + void merge(LeafNode &other) + { + mValueMask |= other.mValueMask; + } + +}; // tools::build::LeafNode + +//================================================================================================ + +template <> +struct LeafNode +{ + using ValueType = bool; + using BuildType = ValueMask; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask, mValues; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues.isOn(mPos);} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, bool value, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) + , mValues(value) + , mDstOffset(0) + { + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& valueMask() const {return mValueMask;} + const Mask& getValueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const + { + ijk += mOrigin; + } + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + bool getFirstValue() const { return mValues.isOn(0); } + bool getLastValue() const { return mValues.isOn(SIZE - 1); } + + bool getValue(uint32_t i) const {return mValues.isOn(i);} + bool getValue(const Coord& ijk) const + { + return mValues.isOn(CoordToOffset(ijk)); + } +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } + + template + bool getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValues.isOn(CoordToOffset(ijk)); + } + + template + void setValueAndCache(const Coord& ijk, bool value, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + mValues.setOn(n); + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } +#endif + + void setValue(uint32_t n, bool value) + { + mValueMask.setOn(n); + mValues.set(n, value); + } + void setValue(const Coord& ijk, bool value) {return this->setValue(CoordToOffset(ijk), value);} + + void merge(LeafNode &other) + { + mValues |= other.mValues; + mValueMask |= other.mValueMask; + } + +}; // tools::build::LeafNode + 
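All three LeafNode variants above share the same 8x8x8 coordinate mapping: the low LOG2DIM = 3 bits of each axis select the voxel, and CoordToOffset packs them into a linear offset x*64 + y*8 + z, which OffsetToLocalCoord inverts. A minimal standalone sketch of that round trip (re-implemented here for illustration; it does not depend on the nanovdb headers):

#include <cassert>
#include <cstdint>

int main()
{
    constexpr uint32_t LOG2DIM = 3, MASK = (1u << LOG2DIM) - 1u; // an 8^3 leaf, MASK = 7
    const int32_t ijk[3] = {123, -5, 42};                        // arbitrary global coordinates
    // CoordToOffset: ((x & 7) << 6) + ((y & 7) << 3) + (z & 7)
    const uint32_t n = ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) +
                       ((ijk[1] & int32_t(MASK)) << LOG2DIM) +
                        (ijk[2] & int32_t(MASK));
    // OffsetToLocalCoord: the inverse mapping
    const uint32_t m = n & ((1u << 2 * LOG2DIM) - 1u);
    const uint32_t xyz[3] = {n >> 2 * LOG2DIM, m >> LOG2DIM, m & MASK};
    assert(xyz[0] == uint32_t(ijk[0] & int32_t(MASK)));
    assert(xyz[1] == uint32_t(ijk[1] & int32_t(MASK)));
    assert(xyz[2] == uint32_t(ijk[2] & int32_t(MASK)));
    return 0;
}

The same pattern, shifted right by ChildT::TOTAL, drives InternalNode::CoordToOffset further up the tree.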
+//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +LeafNode::signedFloodFill(T outside) +{ + const uint32_t first = *mValueMask.beginOn(); + if (first < SIZE) { + bool xInside = mValues[first] < 0, yInside = xInside, zInside = xInside; + for (uint32_t x = 0; x != DIM; ++x) { + const uint32_t x00 = x << (2 * LOG2DIM); + if (mValueMask.isOn(x00)) + xInside = mValues[x00] < 0; // element(x, 0, 0) + yInside = xInside; + for (uint32_t y = 0; y != DIM; ++y) { + const uint32_t xy0 = x00 + (y << LOG2DIM); + if (mValueMask.isOn(xy0)) + yInside = mValues[xy0] < 0; // element(x, y, 0) + zInside = yInside; + for (uint32_t z = 0; z != (1 << LOG2DIM); ++z) { + const uint32_t xyz = xy0 + z; // element(x, y, z) + if (mValueMask.isOn(xyz)) { + zInside = mValues[xyz] < 0; + } else { + mValues[xyz] = zInside ? -outside : outside; + } + } + } + } + } +} // tools::build::LeafNode::signedFloodFill + +// ----------------------------> ValueAccessor <-------------------------------------- + +template +struct ValueAccessor +{ + using ValueType = typename BuildToValueMap::type; + using LeafT = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using RootNodeType = RootNode; + using LeafNodeType = typename RootNodeType::LeafNodeType; + + ValueAccessor(RootNodeType& root) + : mRoot(root) + , mKeys{Coord(math::Maximum::value()), Coord(math::Maximum::value()), Coord(math::Maximum::value())} + , mNode{nullptr, nullptr, nullptr} + { + } + ValueAccessor(ValueAccessor&&) = default; // allow move construction + ValueAccessor(const ValueAccessor&) = delete; // disallow copy construction + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + template + bool isCached(const Coord& ijk) const + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + } + + template + auto get(const Coord& ijk, ArgsT&&... args) const + { + if (this->template isCached(ijk)) { + return ((const LeafT*)mNode[0])->template get(ijk, args...); + } else if (this->template isCached(ijk)) { + return ((const Node1*)mNode[1])->template getAndCache(ijk, *this, args...); + } else if (this->template isCached(ijk)) { + return ((const Node2*)mNode[2])->template getAndCache(ijk, *this, args...); + } + return mRoot.template getAndCache(ijk, *this, args...); + } + + template + auto set(const Coord& ijk, ArgsT&&... 
args) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->template set(ijk, args...); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->template setAndCache(ijk, *this, args...); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->template setAndCache(ijk, *this, args...); + } + return mRoot.template setAndCache(ijk, *this, args...); + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} + LeafT* setValue(const Coord& ijk, const ValueType& value) {return this->template set>(ijk, value);} + LeafT* setValueOn(const Coord& ijk) {return this->template set>(ijk);} + LeafT& touchLeaf(const Coord& ijk) {return this->template set>(ijk);} + bool isActive(const Coord& ijk) const {return this->template get>(ijk);} +#else + ValueType getValue(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->getValueAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->getValueAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->getValueAndCache(ijk, *this); + } + return mRoot.getValueAndCache(ijk, *this); + } + + /// @brief Sets value in a leaf node and returns it. + LeafT* setValue(const Coord& ijk, const ValueType& value) + { + if (this->template isCached(ijk)) { + ((LeafT*)mNode[0])->setValueAndCache(ijk, value, *this); + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->setValueAndCache(ijk, value, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->setValueAndCache(ijk, value, *this); + } else { + mRoot.setValueAndCache(ijk, value, *this); + } + NANOVDB_ASSERT(this->isCached(ijk)); + return (LeafT*)mNode[0]; + } + void setValueOn(const Coord& ijk) + { + if (this->template isCached(ijk)) { + ((LeafT*)mNode[0])->setValueOnAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->setValueOnAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->setValueOnAndCache(ijk, *this); + } else { + mRoot.setValueOnAndCache(ijk, *this); + } + } + void touchLeaf(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return; + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->touchLeafAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->touchLeafAndCache(ijk, *this); + } else { + mRoot.touchLeafAndCache(ijk, *this); + } + } + bool isActive(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->isActiveAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->isActiveAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->isActiveAndCache(ijk, *this); + } + return mRoot.isActiveAndCache(ijk, *this); + } +#endif + + bool isValueOn(const Coord& ijk) const { return this->isActive(ijk); } + template + void insert(const Coord& ijk, NodeT* node) const + { + mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; + mNode[NodeT::LEVEL] = node; + } + RootNodeType& mRoot; + mutable Coord mKeys[3]; + mutable void* mNode[3]; +}; // tools::build::ValueAccessor + +// ----------------------------> Tree <-------------------------------------- + +template +struct Tree +{ + using ValueType = typename BuildToValueMap::type; + using Node0 = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using 
RootNodeType = RootNode; + using LeafNodeType = typename RootNodeType::LeafNodeType; + struct WriteAccessor; + + RootNodeType mRoot; + std::mutex mMutex; + + Tree(const ValueType &background) : mRoot(background) {} + Tree(const Tree&) = delete; // disallow copy construction + Tree(Tree&&) = delete; // disallow move construction + Tree& tree() {return *this;} + RootNodeType& root() {return mRoot;} + ValueType getValue(const Coord& ijk) const {return mRoot.getValue(ijk);} + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + void setValue(const Coord& ijk, const ValueType &value) {mRoot.setValue(ijk, value);} + std::array nodeCount() const + { + std::array count{0,0,0}; + mRoot.nodeCount(count); + return count; + } + /// @brief regular accessor for thread-safe reading and non-thread-safe writing + ValueAccessor getAccessor() { return ValueAccessor(mRoot); } + /// @brief special accessor for thread-safe writing only + WriteAccessor getWriteAccessor() { return WriteAccessor(mRoot, mMutex); } +};// tools::build::Tree + +// ----------------------------> Tree::WriteAccessor <-------------------------------------- + +template +struct Tree::WriteAccessor +{ + using AccT = ValueAccessor; + using ValueType = typename AccT::ValueType; + using LeafT = typename AccT::LeafT; + using Node1 = typename AccT::Node1; + using Node2 = typename AccT::Node2; + using RootNodeType = typename AccT::RootNodeType; + + WriteAccessor(RootNodeType& parent, std::mutex &mx) + : mParent(parent) + , mRoot(parent.mBackground) + , mAcc(mRoot) + , mMutex(mx) + { + } + WriteAccessor(const WriteAccessor&) = delete; // disallow copy construction + WriteAccessor(WriteAccessor&&) = default; // allow move construction + ~WriteAccessor() { this->merge(); } + void merge() + { + mMutex.lock(); + mParent.merge(mRoot); + mMutex.unlock(); + } + inline void setValueOn(const Coord& ijk) {mAcc.setValueOn(ijk);} + inline void setValue(const Coord& ijk, const ValueType &value) {mAcc.setValue(ijk, value);} + + RootNodeType &mParent, mRoot; + AccT mAcc; + std::mutex &mMutex; +}; // tools::build::Tree::WriteAccessor + +// ----------------------------> Grid <-------------------------------------- + +template +struct Grid : public Tree +{ + using BuildType = BuildT; + using ValueType = typename BuildToValueMap::type; + using TreeType = Tree; + using Node0 = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using RootNodeType = RootNode; + + GridClass mGridClass; + GridType mGridType; + Map mMap; + std::string mName; + + Grid(const ValueType &background, const std::string &name = "", GridClass gClass = GridClass::Unknown) + : TreeType(background) + , mGridClass(gClass) + , mGridType(toGridType()) + , mName(name) + { + mMap.set(1.0, Vec3d(0.0), 1.0); + } + TreeType& tree() {return *this;} + const GridType& gridType() const { return mGridType; } + const GridClass& gridClass() const { return mGridClass; } + const Map& map() const { return mMap; } + void setTransform(double scale=1.0, const Vec3d &translation = Vec3d(0.0)) {mMap.set(scale, translation, 1.0);} + const std::string& gridName() const { return mName; } + const std::string& getName() const { return mName; } + void setName(const std::string &name) { mName = name; } + /// @brief Sets grids values in domain of the @a bbox to those returned by the specified @a func with the + /// expected signature [](const Coord&)->ValueType. 
+    ///
+    /// @note If @a func returns a value equal to the background value of the input grid at a
+    /// specific voxel coordinate, then the active state of that coordinate is off! Else the
+    /// value is set and the active state is on. This is done to allow for sparse grids to be generated.
+    ///
+    /// @param func  Functor used to evaluate the grid values in the @a bbox
+    /// @param bbox  Coordinate bounding-box over which the grid values will be set.
+    /// @param delta Specifies a lower threshold value for rendering (optional). Typically equals the voxel size
+    ///              for level sets and otherwise it's zero.
+    template <typename Func>
+    void operator()(const Func& func, const CoordBBox& bbox, ValueType delta = ValueType(0));
+};// tools::build::Grid
+
+template <typename BuildT>
+template <typename Func>
+void Grid<BuildT>::operator()(const Func& func, const CoordBBox& bbox, ValueType delta)
+{
+    auto &root = this->tree().root();
+#if __cplusplus >= 201703L
+    static_assert(util::is_same<ValueType, typename std::invoke_result<Func, const Coord&>::type>::value, "GridBuilder: mismatched ValueType");
+#else// invoke_result was introduced in C++17 and result_of was removed in C++20
+    static_assert(util::is_same<ValueType, typename std::result_of<Func(const Coord&)>::type>::value, "GridBuilder: mismatched ValueType");
+#endif
+    const CoordBBox leafBBox(bbox[0] >> Node0::TOTAL, bbox[1] >> Node0::TOTAL);
+    std::mutex mutex;
+    util::forEach(leafBBox, [&](const CoordBBox& b) {
+        Node0* leaf = nullptr;
+        for (auto it = b.begin(); it; ++it) {
+            Coord min(*it << Node0::TOTAL), max(min + Coord(Node0::DIM - 1));
+            const CoordBBox b(min.maxComponent(bbox.min()),
+                              max.minComponent(bbox.max()));// crop
+            if (leaf == nullptr) {
+                leaf = new Node0(b[0], root.mBackground, false);
+            } else {
+                leaf->mOrigin = b[0] & ~Node0::MASK;
+                NANOVDB_ASSERT(leaf->mValueMask.isOff());
+            }
+            leaf->mDstOffset = 0;// no prune
+            for (auto ijk = b.begin(); ijk; ++ijk) {
+                const auto v = func(*ijk);// call functor
+                if (v != root.mBackground) leaf->setValue(*ijk, v);// don't insert background values
+            }
+            if (!leaf->mValueMask.isOff()) {// has active values
+                if (leaf->mValueMask.isOn()) {// only active values
+                    const auto first = leaf->getFirstValue();
+                    int n=1;
+                    while (n<512) {// 8^3 = 512
+                        if (leaf->mValues[n++] != first) break;
+                    }
+                    if (n == 512) leaf->mDstOffset = 1;// prune below
+                }
+                std::lock_guard<std::mutex> guard(mutex);
+                NANOVDB_ASSERT(leaf != nullptr);
+                root.addNode(leaf);
+                NANOVDB_ASSERT(leaf == nullptr);
+            }
+        }// loop over sub-part of leafBBox
+        if (leaf) delete leaf;
+    });
+
+    // Prune leaf and tile nodes
+    for (auto it2 = root.mTable.begin(); it2 != root.mTable.end(); ++it2) {
+        if (auto *upper = it2->second.child) {//upper level internal node
+            for (auto it1 = upper->mChildMask.beginOn(); it1; ++it1) {
+                auto *lower = upper->mTable[*it1].child;// lower level internal node
+                for (auto it0 = lower->mChildMask.beginOn(); it0; ++it0) {
+                    auto *leaf = lower->mTable[*it0].child;// leaf nodes
+                    if (leaf->mDstOffset) {
+                        lower->mTable[*it0].value = leaf->getFirstValue();
+                        lower->mChildMask.setOff(*it0);
+                        lower->mValueMask.setOn(*it0);
+                        delete leaf;
+                    }
+                }// loop over leaf nodes
+                if (lower->mChildMask.isOff()) {//only tiles
+                    const auto first = lower->getFirstValue();
+                    int n=1;
+                    while (n < 4096) {// 16^3 = 4096
+                        if (lower->mTable[n++].value != first) break;
+                    }
+                    if (n == 4096) {// identical tile values so prune
+                        upper->mTable[*it1].value = first;
+                        upper->mChildMask.setOff(*it1);
+                        upper->mValueMask.setOn(*it1);
+                        delete lower;
+                    }
+                }
+            }// loop over lower internal nodes
+            if (upper->mChildMask.isOff()) {//only tiles
+                const auto first = upper->getFirstValue();
+                int n=1;
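+                // An upper internal node stores 32^3 = 32768 tile entries; the
+                // scan below collapses the node into a single root tile when all
+                // of its tiles carry the same value, mirroring the 512-entry leaf
+                // and 4096-entry lower-node scans above.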
+                while (n < 32768) {// 32^3 = 32768
+                    if (upper->mTable[n++].value != first) break;
+                }
+                if (n == 32768) {// identical tile values so prune
+                    it2->second.value = first;
+                    it2->second.state = upper->mValueMask.isOn();
+                    it2->second.child = nullptr;
+                    delete upper;
+                }
+            }
+        }// is child node of the root
+    }// loop over root table
+}// tools::build::Grid::operator()
+
+//================================================================================================
+
+template <typename BuildT>
+using BuildLeaf = LeafNode<BuildT>;
+template <typename BuildT>
+using BuildLower = InternalNode<BuildLeaf<BuildT>>;
+template <typename BuildT>
+using BuildUpper = InternalNode<BuildLower<BuildT>>;
+template <typename BuildT>
+using BuildRoot = RootNode<BuildUpper<BuildT>>;
+template <typename BuildT>
+using BuildTile = typename BuildRoot<BuildT>::Tile;
+
+using FloatGrid = Grid<float>;
+using Fp4Grid = Grid<Fp4>;
+using Fp8Grid = Grid<Fp8>;
+using Fp16Grid = Grid<Fp16>;
+using FpNGrid = Grid<FpN>;
+using DoubleGrid = Grid<double>;
+using Int32Grid = Grid<int32_t>;
+using UInt32Grid = Grid<uint32_t>;
+using Int64Grid = Grid<int64_t>;
+using Vec3fGrid = Grid<Vec3f>;
+using Vec3dGrid = Grid<Vec3d>;
+using Vec4fGrid = Grid<Vec4f>;
+using Vec4dGrid = Grid<Vec4d>;
+using MaskGrid = Grid<ValueMask>;
+using IndexGrid = Grid<ValueIndex>;
+using OnIndexGrid = Grid<ValueOnIndex>;
+using BoolGrid = Grid<bool>;
+
+// ----------------------------> NodeManager <--------------------------------------
+
+// GridT can be openvdb::Grid and nanovdb::tools::build::Grid
+template <typename GridT>
+class NodeManager
+{
+public:
+
+    using ValueType = typename GridT::ValueType;
+    using BuildType = typename GridT::BuildType;
+    using GridType = GridT;
+    using TreeType = typename GridT::TreeType;
+    using RootNodeType = typename TreeType::RootNodeType;
+    static_assert(RootNodeType::LEVEL == 3, "NodeManager expected LEVEL=3");
+    using Node2 = typename RootNodeType::ChildNodeType;
+    using Node1 = typename Node2::ChildNodeType;
+    using Node0 = typename Node1::ChildNodeType;
+
+    NodeManager(GridT &grid) : mGrid(grid) {this->init();}
+    void init()
+    {
+        mArray0.clear();
+        mArray1.clear();
+        mArray2.clear();
+        auto counts = mGrid.tree().nodeCount();
+        mArray0.reserve(counts[0]);
+        mArray1.reserve(counts[1]);
+        mArray2.reserve(counts[2]);
+
+        for (auto it2 = mGrid.tree().root().cbeginChildOn(); it2; ++it2) {
+            Node2 &upper = const_cast<Node2&>(*it2);
+            mArray2.emplace_back(&upper);
+            for (auto it1 = upper.cbeginChildOn(); it1; ++it1) {
+                Node1 &lower = const_cast<Node1&>(*it1);
+                mArray1.emplace_back(&lower);
+                for (auto it0 = lower.cbeginChildOn(); it0; ++it0) {
+                    Node0 &leaf = const_cast<Node0&>(*it0);
+                    mArray0.emplace_back(&leaf);
+                }// loop over leaf nodes
+            }// loop over lower internal nodes
+        }// loop over root node
+    }
+
+    /// @brief Return the number of tree nodes at the specified level
+    /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level
+    uint64_t nodeCount(int level) const
+    {
+        NANOVDB_ASSERT(level==0 || level==1 || level==2);
+        return level==0 ?
mArray1.size() : mArray2.size(); + } + + template + typename util::enable_if::type node(int i) {return *mArray0[i];} + template + typename util::enable_if::type node(int i) const {return *mArray0[i];} + template + typename util::enable_if::type node(int i) {return *mArray1[i];} + template + typename util::enable_if::type node(int i) const {return *mArray1[i];} + template + typename util::enable_if::type node(int i) {return *mArray2[i];} + template + typename util::enable_if::type node(int i) const {return *mArray2[i];} + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + const Node0& leaf(uint32_t i) const { return *mArray0[i]; } + Node0& leaf(uint32_t i) { return *mArray0[i]; } + uint64_t leafCount() const {return mArray0.size();} + + /// @brief Return the i'th lower internal node with respect to breadth-first ordering + const Node1& lower(uint32_t i) const { return *mArray1[i]; } + Node1& lower(uint32_t i) { return *mArray1[i]; } + uint64_t lowerCount() const {return mArray1.size();} + + /// @brief Return the i'th upper internal node with respect to breadth-first ordering + const Node2& upper(uint32_t i) const { return *mArray2[i]; } + Node2& upper(uint32_t i) { return *mArray2[i]; } + uint64_t upperCount() const {return mArray2.size();} + + RootNodeType& root() {return mGrid.tree().root();} + const RootNodeType& root() const {return mGrid.tree().root();} + + TreeType& tree() {return mGrid.tree();} + const TreeType& tree() const {return mGrid.tree();} + + GridType& grid() {return mGrid;} + const GridType& grid() const {return mGrid;} + +protected: + + GridT &mGrid; + std::vector mArray0; // leaf nodes + std::vector mArray1; // lower internal nodes + std::vector mArray2; // upper internal nodes + +};// NodeManager + +template +typename util::enable_if::value>::type +sdfToLevelSet(NodeManagerT &mgr) +{ + mgr.grid().mGridClass = GridClass::LevelSet; + // Note that the bottom-up flood filling is essential + const auto outside = mgr.root().mBackground; + util::forEach(0, mgr.leafCount(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.leaf(i).signedFloodFill(outside); + }); + util::forEach(0, mgr.lowerCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.lower(i).signedFloodFill(outside); + }); + util::forEach(0, mgr.upperCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.upper(i).signedFloodFill(outside); + }); + mgr.root().signedFloodFill(outside); +}// sdfToLevelSet + +template +void levelSetToFog(NodeManagerT &mgr, bool rebuild = true) +{ + using ValueType = typename NodeManagerT::ValueType; + mgr.grid().mGridClass = GridClass::FogVolume; + const ValueType d = -mgr.root().mBackground, w = 1.0f / d; + //std::atomic_bool prune{false}; + std::atomic prune{false}; + auto op = [&](ValueType& v) -> bool { + if (v > ValueType(0)) { + v = ValueType(0); + return false; + } + v = v > d ? 
v * w : ValueType(1); + return true; + }; + util::forEach(0, mgr.leafCount(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& leaf = mgr.leaf(i); + for (uint32_t i = 0; i < 512u; ++i) leaf.mValueMask.set(i, op(leaf.mValues[i])); + } + }); + util::forEach(0, mgr.lowerCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& node = mgr.lower(i); + for (uint32_t i = 0; i < 4096u; ++i) { + if (node.mChildMask.isOn(i)) { + auto* leaf = node.mTable[i].child; + if (leaf->mValueMask.isOff()) {// prune leaf node + node.mTable[i].value = leaf->getFirstValue(); + node.mChildMask.setOff(i); + delete leaf; + prune = true; + } + } else { + node.mValueMask.set(i, op(node.mTable[i].value)); + } + } + } + }); + util::forEach(0, mgr.upperCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& node = mgr.upper(i); + for (uint32_t i = 0; i < 32768u; ++i) { + if (node.mChildMask.isOn(i)) {// prune lower internal node + auto* child = node.mTable[i].child; + if (child->mChildMask.isOff() && child->mValueMask.isOff()) { + node.mTable[i].value = child->getFirstValue(); + node.mChildMask.setOff(i); + delete child; + prune = true; + } + } else { + node.mValueMask.set(i, op(node.mTable[i].value)); + } + } + } + }); + + for (auto it = mgr.root().mTable.begin(); it != mgr.root().mTable.end(); ++it) { + auto* child = it->second.child; + if (child == nullptr) { + it->second.state = op(it->second.value); + } else if (child->mChildMask.isOff() && child->mValueMask.isOff()) { + it->second.value = child->getFirstValue(); + it->second.state = false; + it->second.child = nullptr; + delete child; + prune = true; + } + } + if (rebuild && prune) mgr.init(); +}// levelSetToFog + +// ----------------------------> Implementations of random access methods <-------------------------------------- + +template +struct TouchLeaf { + static BuildLeaf& set(BuildLeaf &leaf, uint32_t) {return leaf;} +};// TouchLeaf + +/// @brief Implements Tree::getValue(Coord), i.e. return the value associated with a specific coordinate @c ijk. +/// @tparam BuildT Build type of the grid being called +/// @details The value at a coordinate maps to the background, a tile value or a leaf value. +template +struct GetValue { + static auto get(const BuildRoot &root) {return root.mBackground;} + static auto get(const BuildTile &tile) {return tile.value;} + static auto get(const BuildUpper &node, uint32_t n) {return node.mTable[n].value;} + static auto get(const BuildLower &node, uint32_t n) {return node.mTable[n].value;} + static auto get(const BuildLeaf &leaf, uint32_t n) {return leaf.getValue(n);} +};// GetValue + +/// @brief Implements Tree::isActive(Coord) +/// @tparam T Build type of the grid being called +template +struct GetState { + static bool get(const BuildRoot&) {return false;} + static bool get(const BuildTile &tile) {return tile.state;} + static bool get(const BuildUpper &node, uint32_t n) {return node.mValueMask.isOn(n);} + static bool get(const BuildLower &node, uint32_t n) {return node.mValueMask.isOn(n);} + static bool get(const BuildLeaf &leaf, uint32_t n) {return leaf.mValueMask.isOn(n);} +};// GetState + +/// @brief Set the value and its state at the leaf level mapped to by ijk, and create the leaf node and branch if needed. 
+/// @tparam T BuildType of the corresponding tree +template +struct SetValue { + static BuildLeaf* set(BuildLeaf &leaf, uint32_t n) { + leaf.mValueMask.setOn(n);// always set the active bit + return &leaf; + } + static BuildLeaf* set(BuildLeaf &leaf, uint32_t n, const typename BuildLeaf::ValueType &v) { + leaf.setValue(n, v); + return &leaf; + } +};// SetValue + +/// @brief Implements Tree::probeLeaf(Coord) +/// @tparam T Build type of the grid being called +template +struct ProbeValue { + using ValueT = typename BuildLeaf::ValueType; + static bool get(const BuildRoot &root, ValueT &v) { + v = root.mBackground; + return false; + } + static bool get(const BuildTile &tile, ValueT &v) { + v = tile.value; + return tile.state; + } + static bool get(const BuildUpper &node, uint32_t n, ValueT &v) { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + static bool get(const BuildLower &node, uint32_t n, ValueT &v) { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + static bool get(const BuildLeaf &leaf, uint32_t n, ValueT &v) { + v = leaf.getValue(n); + return leaf.isActive(n); + } +};// ProbeValue + +} // namespace tools::build + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/tools/GridChecksum.h b/nanovdb/nanovdb/tools/GridChecksum.h new file mode 100644 index 0000000000..882ab9222a --- /dev/null +++ b/nanovdb/nanovdb/tools/GridChecksum.h @@ -0,0 +1,427 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/tools/GridChecksum.h + + \author Ken Museth + + \brief Computes a pair of uint32_t checksums, of a Grid, by means of 32 bit Cyclic Redundancy Check (CRC32) + + \details A CRC32 is the 32 bit remainder, or residue, of binary division of a message, by a polynomial. + + + \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes + after v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks + + When serialized: + [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...] + checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] ---------------> + checksum[2] after v32.6.0: <---[0]---><----------------------------------------[1]----------------------------------------> +*/ + +#ifndef NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED + +#include // for std::generate +#include +#include +#include +#include // offsetof macro +#include +#include +#include // for std::unique_ptr + +#include +#include +#include + +// Define log of block size for FULL CRC32 computation. +// A value of 12 corresponds to a block size of 4KB (2^12 = 4096). +#define NANOVDB_CRC32_LOG2_BLOCK_SIZE 12 + +namespace nanovdb {// ================================================================== + +namespace tools {// ==================================================================== + +/// @brief Compute the (2 x CRC32) checksum of the specified @c gridData +/// @param gridData Base pointer to the grid from which the checksum is computed. +/// @param mode Defines the mode of computation for the checksum. 
+/// @return Return the (2 x CRC32) checksum of the specified @c gridData
+Checksum evalChecksum(const GridData *gridData, CheckMode mode = CheckMode::Default);
+
+/// @brief Extract the checksum of a grid
+/// @param gridData Base pointer to grid with a checksum
+/// @return Checksum encoded in the specified grid
+inline Checksum getChecksum(const GridData *gridData)
+{
+    NANOVDB_ASSERT(gridData);
+    return gridData->mChecksum;
+}
+
+/// @brief Return true if the checksum of @c gridData matches the expected
+///        value already encoded into the grid's meta data.
+/// @param gridData Base pointer to the grid whose checksum is validated.
+/// @param mode Defines the mode of computation for the checksum.
+bool validateChecksum(const GridData *gridData, CheckMode mode = CheckMode::Default);
+
+/// @brief Updates the checksum of a grid
+/// @param gridData Base pointer to the grid whose checksum will be updated.
+/// @param mode Defines the mode of computation for the checksum.
+inline void updateChecksum(GridData *gridData, CheckMode mode)
+{
+    NANOVDB_ASSERT(gridData);
+    gridData->mChecksum = evalChecksum(gridData, mode);
+}
+
+/// @brief Updates the checksum of a grid by preserving its mode
+/// @param gridData Base pointer to grid
+inline void updateChecksum(GridData *gridData)
+{
+    updateChecksum(gridData, gridData->mChecksum.mode());
+}
+
+}// namespace tools
+
+namespace util {
+
+/// @brief Initialize a single entry in the look-up table for CRC32 computations
+/// @param lut pointer of size 256 for the look-up table
+/// @param n entry in the table (assumed n < 256)
+inline __hostdev__ void initCrc32Lut(uint32_t lut[256], uint32_t n)
+{
+    lut[n] = n;
+    uint32_t &cs = lut[n];
+    for (int i = 0; i < 8; ++i) cs = (cs >> 1) ^ ((cs & 1) ? 0xEDB88320 : 0);
+}
+
+/// @brief Initialize the entire look-up table for CRC32 computations
+/// @param lut pointer of size 256 for the look-up table
+inline __hostdev__ void initCrc32Lut(uint32_t lut[256]){for (uint32_t n = 0u; n < 256u; ++n) initCrc32Lut(lut, n);}
+
+/// @brief Create and initialize the entire look-up table for CRC32 computations
+/// @return returns a unique pointer to the look-up table of size 256.
+inline std::unique_ptr<uint32_t[]> createCrc32Lut()
+{
+    std::unique_ptr<uint32_t[]> lut(new uint32_t[256]);
+    initCrc32Lut(lut.get());
+    return lut;
+}
+
+/// @brief Compute crc32 checksum of @c data of @c size bytes (without a lookup table)
+/// @param data pointer to beginning of data
+/// @param size byte size of data
+/// @param crc initial value of crc32 checksum
+/// @return return crc32 checksum of @c data
+inline __hostdev__ uint32_t crc32(const void* data, size_t size, uint32_t crc = 0)
+{
+    NANOVDB_ASSERT(data);
+    crc = ~crc;
+    for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) {
+        crc ^= *p;
+        for (int j = 0; j < 8; ++j) crc = (crc >> 1) ^ (0xEDB88320 & (-(crc & 1)));
+    }
+    return ~crc;
+}
+
+/// @brief Compute crc32 checksum of data between @c begin and @c end
+/// @param begin points to beginning of data
+/// @param end points to end of @c data (exclusive)
+/// @param crc initial value of crc32 checksum
+/// @return return crc32 checksum
+inline __hostdev__ uint32_t crc32(const void *begin, const void *end, uint32_t crc = 0)
+{
+    NANOVDB_ASSERT(begin && end);
+    NANOVDB_ASSERT(end >= begin);
+    return crc32(begin, (const char*)end - (const char*)begin, crc);
+}
+
+/// @brief Compute crc32 checksum of @c data with @c size bytes using a lookup table
+/// @param data pointer to beginning of data
+/// @param size byte size
+/// @param lut pointer to lookup table for accelerated crc32 computation
+/// @param crc initial value of the checksum
+/// @return crc32 checksum of @c data with @c size bytes
+inline __hostdev__ uint32_t crc32(const void *data, size_t size, const uint32_t lut[256], uint32_t crc = 0)
+{
+    NANOVDB_ASSERT(data);
+    crc = ~crc;
+    for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) crc = lut[(crc ^ *p) & 0xFF] ^ (crc >> 8);
+    return ~crc;
+}
+
+/// @brief Compute crc32 checksum of data between @c begin and @c end using a lookup table
+/// @param begin points to beginning of data
+/// @param end points to end of @c data (exclusive)
+/// @param lut pointer to lookup table for accelerated crc32 computation
+/// @param crc initial value of crc32 checksum
+/// @return return crc32 checksum
+inline __hostdev__ uint32_t crc32(const void *begin, const void *end, const uint32_t lut[256], uint32_t crc = 0)
+{
+    NANOVDB_ASSERT(begin && end);
+    NANOVDB_ASSERT(end >= begin);
+    return crc32(begin, (const char*)end - (const char*)begin, lut, crc);
+}// uint32_t util::crc32(const void *begin, const void *end, const uint32_t lut[256], uint32_t crc = 0)
+
+/// @brief Compute the crc32 checksum of @c data in blocks of 4KB and then checksum the per-block checksums
+/// @param data pointer to beginning of data
+/// @param size byte size of data
+/// @param lut pointer to lookup table for accelerated crc32 computation
+/// @return blocked crc32 checksum of @c data
+inline uint32_t blockedCrc32(const void *data, size_t size, const uint32_t *lut)
+{
+    if (size == 0) return ~uint32_t(0);
+    const uint64_t blockCount = size >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// number of 4 KB (4096 byte) blocks
+    std::unique_ptr<uint32_t[]> checksums(new uint32_t[blockCount]);
+    forEach(0, blockCount, 64, [&](const Range1D &r) {
+        uint32_t blockSize = 1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE, *p = checksums.get() + r.begin();
+        for (auto i = r.begin(); i != r.end(); ++i) {
+            if (i+1 == blockCount) blockSize += static_cast<uint32_t>(size - (blockCount << NANOVDB_CRC32_LOG2_BLOCK_SIZE));
+            *p++ = crc32((const uint8_t*)data + (i << NANOVDB_CRC32_LOG2_BLOCK_SIZE), blockSize, lut);
+        }
+    });
+    return crc32(checksums.get(), sizeof(uint32_t)*blockCount, lut);
+}// uint32_t util::blockedCrc32(const void *data, size_t size, const uint32_t *lut)
+
+/// @brief Compute the blocked crc32 checksum of data between @c begin and @c end
+/// @param begin points to beginning of data
+/// @param end points to end of @c data (exclusive)
+/// @param lut pointer to lookup table for accelerated crc32 computation
+/// @return blocked crc32 checksum
+inline uint32_t blockedCrc32(const void *begin, const void *end, const uint32_t *lut)
+{
+    NANOVDB_ASSERT(begin && end);
+    NANOVDB_ASSERT(end >= begin);
+    return blockedCrc32(begin, (const char*)end - (const char*)begin, lut);
+}
+
+}// namespace util
+
+namespace tools {
+
+// When serialized:
+// [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...]
+}// namespace util
+
+namespace tools {
+
+// [GridData][TreeData][RootData][Root tiles...][NodeData<5>...][NodeData<4>...][LeafData<3>...][BlindMeta...][BlindData...]
+// checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] --------------->
+// checksum[2] after  v32.6.0: <---[0]---><----------------------------------------[1]---------------------------------------->
+
+// ----------------------------> crc32Head <--------------------------------------
+
+/// @brief Compute the crc32 checksum of the head of a grid, i.e. GridData and TreeData
+///        (and, for grids older than v32.6.0, everything up to the upper internal nodes)
+/// @param gridData Base pointer to the grid
+/// @param lut Lookup table for accelerated crc32 computation
+/// @return crc32 checksum of the head of the grid
+inline __hostdev__ uint32_t crc32Head(const GridData *gridData, const uint32_t *lut)
+{
+    NANOVDB_ASSERT(gridData);
+    const uint8_t *begin = (const uint8_t*)(gridData), *mid = begin + sizeof(GridData) + sizeof(TreeData);
+    if (gridData->mVersion <= Version(32,6,0)) mid = (const uint8_t*)(gridData->template nodePtr<2>());
+    return util::crc32(begin + 16u, mid, lut);// exclude GridData::mMagic and GridData::mChecksum
+}// uint32_t crc32Head(const GridData *gridData, const uint32_t *lut)
+
+/// @brief Compute the crc32 checksum of the head of a grid without a lookup table
+/// @param gridData Base pointer to the grid
+/// @return crc32 checksum of the head of the grid
+inline __hostdev__ uint32_t crc32Head(const GridData *gridData)
+{
+    NANOVDB_ASSERT(gridData);
+    const uint8_t *begin = (const uint8_t*)(gridData), *mid = begin + sizeof(GridData) + sizeof(TreeData);
+    if (gridData->mVersion <= Version(32,6,0)) mid = (const uint8_t*)(gridData->template nodePtr<2>());
+    return util::crc32(begin + 16, mid);// exclude GridData::mMagic and GridData::mChecksum
+}// uint32_t crc32Head(const GridData *gridData)
+
+// ----------------------------> crc32TailOld <--------------------------------------
+
+// Old (pre v32.6.0) checksum of the tail, i.e. all the tree nodes
+template <typename ValueT>
+uint32_t crc32TailOld(const NanoGrid<ValueT> *grid, const uint32_t *lut)
+{
+    NANOVDB_ASSERT(grid->mVersion <= Version(32,6,0));
+    const auto &tree = grid->tree();
+    auto nodeMgrHandle = createNodeManager(*grid);
+    auto *nodeMgr = nodeMgrHandle.template mgr<ValueT>();
+    assert(nodeMgr && isAligned(nodeMgr));
+    const auto nodeCount = tree.nodeCount(0) + tree.nodeCount(1) + tree.nodeCount(2);
+    std::vector<uint32_t> checksums(nodeCount, 0);
+    util::forEach(0, tree.nodeCount(2), 1, [&](const util::Range1D &r) {// process upper internal nodes
+        uint32_t *p = checksums.data() + r.begin();
+        for (auto i = r.begin(); i != r.end(); ++i) {
+            const auto &node = nodeMgr->upper(static_cast<uint32_t>(i));
+            *p++ = util::crc32(&node, node.memUsage(), lut);
+        }
+    });
+    util::forEach(0, tree.nodeCount(1), 1, [&](const util::Range1D &r) {// process lower internal nodes
+        uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(2);
+        for (auto i = r.begin(); i != r.end(); ++i) {
+            const auto &node = nodeMgr->lower(static_cast<uint32_t>(i));
+            *p++ = util::crc32(&node, node.memUsage(), lut);
+        }
+    });
+    util::forEach(0, tree.nodeCount(0), 8, [&](const util::Range1D &r) {// process leaf nodes
+        uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(1) + tree.nodeCount(2);
+        for (auto i = r.begin(); i != r.end(); ++i) {
+            const auto &leaf = nodeMgr->leaf(static_cast<uint32_t>(i));
+            *p++ = util::crc32(&leaf, leaf.memUsage(), lut);
+        }
+    });
+    return util::crc32(checksums.data(), sizeof(uint32_t)*checksums.size(), lut);
+}// uint32_t crc32TailOld(const NanoGrid<ValueT> *grid, const uint32_t *lut)
+
+struct Crc32TailOld {
+    template <typename BuildT>
+    static uint32_t known(const GridData *gridData, const uint32_t *lut)
+    {
+        return crc32TailOld((const NanoGrid<BuildT>*)gridData, lut);
+    }
+    static uint32_t unknown(const GridData*, const uint32_t*)
+    {
+        throw std::runtime_error("Cannot call Crc32TailOld with grid of unknown type");
+        return 0u;//dummy
+    }
+};// struct Crc32TailOld
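+// Relationship sketch (illustrative): a Full checksum is simply the pair of a
+// head and a tail crc32 value, so the two components can be recomputed and
+// compared independently:
+// @code
+// auto lut = util::createCrc32Lut();
+// // assuming gridData points to a valid grid newer than v32.6.0
+// uint32_t head = crc32Head(gridData, lut.get());// cheap: GridData + TreeData only
+// uint32_t tail = crc32Tail(gridData, lut.get());// expensive: rest of the grid buffer
+// @endcode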
+inline uint32_t crc32Tail(const GridData *gridData, const uint32_t *lut)
+{
+    NANOVDB_ASSERT(gridData);
+    if (gridData->mVersion > Version(32,6,0)) {
+        const uint8_t *begin = (const uint8_t*)(gridData);
+        return util::blockedCrc32(begin + sizeof(GridData) + sizeof(TreeData), begin + gridData->mGridSize, lut);
+    } else {
+        return callNanoGrid<Crc32TailOld>(gridData, lut);
+    }
+}// uint32_t crc32Tail(const GridData *gridData, const uint32_t *lut)
+
+template <typename ValueT>
+uint32_t crc32Tail(const NanoGrid<ValueT> *grid, const uint32_t *lut)
+{
+    NANOVDB_ASSERT(grid);
+    if (grid->mVersion > Version(32,6,0)) {
+        const uint8_t *begin = (const uint8_t*)(grid);
+        return util::blockedCrc32(begin + sizeof(GridData) + sizeof(TreeData), begin + grid->mGridSize, lut);
+    } else {
+        return crc32TailOld(grid, lut);
+    }
+}// uint32_t crc32Tail(const NanoGrid<ValueT> *grid, const uint32_t *lut)
+
+// ----------------------------> evalChecksum <--------------------------------------
+
+/// @brief Compute the checksum of a grid
+/// @tparam ValueT Build type of the grid
+/// @param grid Grid whose checksum is computed
+/// @param mode Defines the mode of computation for the checksum
+/// @return Checksum of the specified grid
+template <typename ValueT>
+Checksum evalChecksum(const NanoGrid<ValueT> *grid, CheckMode mode)
+{
+    NANOVDB_ASSERT(grid);
+    Checksum cs;
+    if (mode != CheckMode::Empty) {
+        auto lut = util::createCrc32Lut();
+        cs.head() = crc32Head(grid, lut.get());
+        if (mode == CheckMode::Full) cs.tail() = crc32Tail(grid, lut.get());
+    }
+    return cs;
+}// evalChecksum(const NanoGrid*, CheckMode)
+
+template <typename ValueT>
+[[deprecated("Use evalChecksum(const NanoGrid<ValueT>*, CheckMode) instead")]]
+Checksum checksum(const NanoGrid<ValueT> *grid, CheckMode mode){return evalChecksum(grid, mode);}
+
+inline Checksum evalChecksum(const GridData *gridData, CheckMode mode)
+{
+    NANOVDB_ASSERT(gridData);
+    Checksum cs;
+    if (mode != CheckMode::Disable) {
+        auto lut = util::createCrc32Lut();
+        cs.head() = crc32Head(gridData, lut.get());
+        if (mode == CheckMode::Full) cs.tail() = crc32Tail(gridData, lut.get());
+    }
+    return cs;
+}// evalChecksum(const GridData*, CheckMode)
+
+[[deprecated("Use evalChecksum(const GridData*, CheckMode) instead")]]
+inline Checksum checksum(const GridData *gridData, CheckMode mode){return evalChecksum(gridData, mode);}
+
+template <typename ValueT>
+[[deprecated("Use evalChecksum(const NanoGrid<ValueT>*, CheckMode) instead")]]
+Checksum checksum(const NanoGrid<ValueT> &grid, CheckMode mode){return checksum(&grid, mode);}
+
+// ----------------------------> validateChecksum <--------------------------------------
+
+/// @brief Validate the checksum of a grid
+/// @tparam ValueT Build type of the grid
+/// @param grid Grid whose checksum is validated
+/// @param mode Defines the mode of the validation
+/// @return true if the encoded checksum matches the recomputed one
+template <typename ValueT>
+bool validateChecksum(const NanoGrid<ValueT> *grid, CheckMode mode)
+{
+    if (grid->mChecksum.isEmpty() || mode == CheckMode::Empty) return true;
+    auto lut = util::createCrc32Lut();
+    bool checkHead = grid->mChecksum.head() == crc32Head(grid->data(), lut.get());
+    if (grid->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) {
+        return checkHead;
+    } else {
+        return grid->mChecksum.tail() == crc32Tail(grid, lut.get());
+    }
+}
+
+/// @brief Validate the checksum of a grid
+/// @param gridData Base pointer to the grid whose checksum is validated
+/// @param mode Defines the mode of the validation
+/// @return true if the encoded checksum matches the recomputed one
+inline bool validateChecksum(const GridData *gridData, CheckMode mode)
+{
+    if (gridData->mChecksum.isEmpty() || mode == CheckMode::Empty) return true;
+    auto lut = util::createCrc32Lut();
+    bool checkHead = gridData->mChecksum.head() == crc32Head(gridData, lut.get());
+    if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) {
+        return checkHead;
+    } else {
+        return gridData->mChecksum.tail() == crc32Tail(gridData, lut.get());
+    }
+}// bool validateChecksum(const GridData *gridData, CheckMode mode)
+
+template <typename ValueT>
+[[deprecated("Use validateChecksum(const NanoGrid<ValueT>*, CheckMode) instead")]]
+bool validateChecksum(const NanoGrid<ValueT> &grid, CheckMode mode){return validateChecksum(&grid, mode);}
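+// Usage sketch (illustrative), assuming `grid` is a valid NanoGrid<float> pointer:
+// @code
+// updateChecksum(grid, CheckMode::Full);              // encode head and tail checksums
+// bool full = validateChecksum(grid, CheckMode::Full);// thorough but slow
+// bool fast = validateChecksum(grid, CheckMode::Half);// only re-checks the head
+// @endcode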
+// ----------------------------> updateChecksum <--------------------------------------
+
+/// @brief Updates the checksum of a grid
+/// @tparam ValueT Build type of the grid
+/// @param grid Grid whose checksum will be updated
+/// @param mode Defines the mode of computation for the checksum
+template <typename ValueT>
+void updateChecksum(NanoGrid<ValueT> *grid, CheckMode mode){grid->mChecksum = evalChecksum(grid, mode);}
+
+template <typename ValueT>
+void updateChecksum(NanoGrid<ValueT> *grid){grid->mChecksum = evalChecksum(grid, grid->mChecksum.mode());}
+
+// deprecated method that takes a reference vs a pointer
+template <typename ValueT>
+[[deprecated("Use updateChecksum(const NanoGrid<ValueT>*, CheckMode) instead")]]
+void updateChecksum(NanoGrid<ValueT> &grid, CheckMode mode){updateChecksum(&grid, mode);}
+
+// ----------------------------> updateGridCount <--------------------------------------
+
+/// @brief Updates the grid index and count, as well as the head checksum if needed
+/// @param data Pointer to grid data
+/// @param gridIndex New value of the index
+/// @param gridCount New value of the grid count
+inline void updateGridCount(GridData *data, uint32_t gridIndex, uint32_t gridCount)
+{
+    NANOVDB_ASSERT(data && gridIndex < gridCount);
+    if (data->mGridIndex != gridIndex || data->mGridCount != gridCount) {
+        data->mGridIndex = gridIndex;
+        data->mGridCount = gridCount;
+        if (!data->mChecksum.isEmpty()) data->mChecksum.head() = crc32Head(data);
+    }
+}
+
+} // namespace tools ======================================================================
+
+} // namespace nanovdb ====================================================================
+
+#endif // NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/GridStats.h b/nanovdb/nanovdb/tools/GridStats.h
new file mode 100644
index 0000000000..f07f5a6040
--- /dev/null
+++ b/nanovdb/nanovdb/tools/GridStats.h
@@ -0,0 +1,877 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/GridStats.h
+
+    \author Ken Museth
+
+    \date August 29, 2020
+
+    \brief Re-computes min/max/avg/var/bbox information for each node in a
+           pre-existing NanoVDB grid.
+*/
+
+#ifndef NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/NanoVDB.h>
+
+#ifdef NANOVDB_USE_TBB
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+#endif
+
+#if defined(__CUDACC__)
+#include <cuda/std/limits> // for cuda::std::numeric_limits
+#else
+#include <limits> // for std::numeric_limits
+#endif
+
+#include <iostream> // for std::cerr
+#include <memory> // for std::unique_ptr
+
+namespace nanovdb {
+
+namespace tools {//=======================================================================
+
+/// @brief Mode of the statistics computation, i.e. which extra information is (re)computed for a grid
+enum class StatsMode : uint32_t {
+    Disable = 0,// disable the computation of any type of statistics (obviously the FASTEST!)
+    BBox    = 1,// only compute the bbox of active values per node and total activeVoxelCount
+    MinMax  = 2,// additionally compute extrema values
+    All     = 3,// compute all of the statistics, i.e. bbox, min/max, average and standard deviation
+    Default = 3,// default computational mode for statistics
+    End     = 4,
+};
+
+/// @brief Re-computes the min/max, stats and bbox information for an existing NanoVDB Grid
+/// @param grid Grid whose stats to update
+/// @param mode Mode of computation for the statistics.
+template<typename BuildT>
+void updateGridStats(NanoGrid<BuildT>* grid, StatsMode mode = StatsMode::Default);
+
+template<typename ValueT, int Rank = TensorTraits<ValueT>::Rank>
+class Extrema;
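+// Usage sketch (illustrative): refresh per-node statistics after editing voxel
+// values, assuming `grid` is a valid, writable NanoGrid<float> pointer:
+// @code
+// updateGridStats(grid, StatsMode::All);// bbox, min/max, average, std-deviation
+// @endcode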
+/// @brief Determine the extrema of all the values in a grid that
+///        intersect the specified bounding box.
+/// @tparam BuildT Build type of the input grid
+/// @param grid typed grid
+/// @param bbox index bounding box in which min/max are computed
+/// @return Extrema of the values inside @c bbox
+template<typename BuildT>
+Extrema<typename NanoGrid<BuildT>::ValueType>
+getExtrema(const NanoGrid<BuildT>& grid, const CoordBBox &bbox);
+
+//================================================================================================
+
+/// @brief Template specialization of Extrema on scalar value types, i.e. rank = 0
+template<typename ValueT>
+class Extrema<ValueT, 0>
+{
+protected:
+    ValueT mMin, mMax;
+
+public:
+    using ValueType = ValueT;
+    __hostdev__ Extrema()
+#if defined(__CUDACC__)
+        // note "::cuda" is needed since we also define a cuda namespace
+        : mMin(::cuda::std::numeric_limits<ValueT>::max())
+        , mMax(::cuda::std::numeric_limits<ValueT>::lowest())
+#else
+        : mMin(std::numeric_limits<ValueT>::max())
+        , mMax(std::numeric_limits<ValueT>::lowest())
+#endif
+    {
+    }
+    __hostdev__ Extrema(const ValueT& v)
+        : mMin(v)
+        , mMax(v)
+    {
+    }
+    __hostdev__ Extrema(const ValueT& a, const ValueT& b)
+        : mMin(a)
+        , mMax(b)
+    {
+    }
+    __hostdev__ Extrema& min(const ValueT& v)
+    {
+        if (v < mMin) mMin = v;
+        return *this;
+    }
+    __hostdev__ Extrema& max(const ValueT& v)
+    {
+        if (v > mMax) mMax = v;
+        return *this;
+    }
+    __hostdev__ Extrema& add(const ValueT& v)
+    {
+        this->min(v);
+        this->max(v);
+        return *this;
+    }
+    __hostdev__ Extrema& add(const ValueT& v, uint64_t) { return this->add(v); }
+    __hostdev__ Extrema& add(const Extrema& other)
+    {
+        this->min(other.mMin);
+        this->max(other.mMax);
+        return *this;
+    }
+    __hostdev__ const ValueT& min() const { return mMin; }
+    __hostdev__ const ValueT& max() const { return mMax; }
+    __hostdev__ operator bool() const { return mMin <= mMax; }
+    __hostdev__ static constexpr bool hasMinMax() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasAverage() { return false; }
+    __hostdev__ static constexpr bool hasStdDeviation() { return false; }
+    __hostdev__ static constexpr bool hasStats() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr size_t size() { return 0; }
+
+    template<typename NodeT>
+    __hostdev__ void setStats(NodeT &node) const
+    {
+        node.setMin(this->min());
+        node.setMax(this->max());
+    }
+}; // Extrema<ValueT, 0>
+
+/// @brief Template specialization of Extrema on vector value types, i.e.
rank = 1 +template +class Extrema +{ +protected: + using Real = typename VecT::ValueType; // this works with both nanovdb and openvdb vectors + struct Pair + { + Real scalar; + VecT vector; + + __hostdev__ Pair(Real s)// is only used by Extrema() default c-tor + : scalar(s) + , vector(s) + { + } + __hostdev__ Pair(const VecT& v) + : scalar(v.lengthSqr()) + , vector(v) + { + } + __hostdev__ bool operator<(const Pair& rhs) const { return scalar < rhs.scalar; } + } mMin, mMax; + __hostdev__ Extrema& add(const Pair& p) + { + if (p < mMin) mMin = p; + if (mMax < p) mMax = p; + return *this; + } + +public: + using ValueType = VecT; + __hostdev__ Extrema() +#if defined(__CUDACC__) + // note "::cuda" is needed since we also define a cuda namespace + : mMin(::cuda::std::numeric_limits::max()) + , mMax(::cuda::std::numeric_limits::lowest()) +#else + : mMin(std::numeric_limits::max()) + , mMax(std::numeric_limits::lowest()) +#endif + { + } + __hostdev__ Extrema(const VecT& v) + : mMin(v) + , mMax(v) + { + } + __hostdev__ Extrema(const VecT& a, const VecT& b) + : mMin(a) + , mMax(b) + { + } + __hostdev__ Extrema& min(const VecT& v) + { + Pair tmp(v); + if (tmp < mMin) mMin = tmp; + return *this; + } + __hostdev__ Extrema& max(const VecT& v) + { + Pair tmp(v); + if (mMax < tmp) mMax = tmp; + return *this; + } + __hostdev__ Extrema& add(const VecT& v) { return this->add(Pair(v)); } + __hostdev__ Extrema& add(const VecT& v, uint64_t) { return this->add(Pair(v)); } + __hostdev__ Extrema& add(const Extrema& other) + { + if (other.mMin < mMin) mMin = other.mMin; + if (mMax < other.mMax) mMax = other.mMax; + return *this; + } + __hostdev__ const VecT& min() const { return mMin.vector; } + __hostdev__ const VecT& max() const { return mMax.vector; } + __hostdev__ operator bool() const { return !(mMax < mMin); } + __hostdev__ static constexpr bool hasMinMax() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasAverage() { return false; } + __hostdev__ static constexpr bool hasStdDeviation() { return false; } + __hostdev__ static constexpr bool hasStats() { return !util::is_same::value; } + __hostdev__ static constexpr size_t size() { return 0; } + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + } +}; // Extrema + +//================================================================================================ + +template::Rank> +class Stats; + +/// @brief This class computes statistics (minimum value, maximum +/// value, mean, variance and standard deviation) of a population +/// of floating-point values. +/// +/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2, +/// standard deviation = sqrt(variance) +/// +/// @note This class employs incremental computation and double precision. 
+template<typename ValueT>
+class Stats<ValueT, 0> : public Extrema<ValueT, 0>
+{
+protected:
+    using BaseT = Extrema<ValueT, 0>;
+    using RealT = double; // for accuracy the internal precision must be 64 bit floats
+    size_t mSize;
+    double mAvg, mAux;
+
+public:
+    using ValueType = ValueT;
+    __hostdev__ Stats()
+        : BaseT()
+        , mSize(0)
+        , mAvg(0.0)
+        , mAux(0.0)
+    {
+    }
+    __hostdev__ Stats(const ValueT& val)
+        : BaseT(val)
+        , mSize(1)
+        , mAvg(RealT(val))
+        , mAux(0.0)
+    {
+    }
+    /// @brief Add a single sample
+    __hostdev__ Stats& add(const ValueT& val)
+    {
+        BaseT::add(val);
+        mSize += 1;
+        const double delta = double(val) - mAvg;
+        mAvg += delta / double(mSize);
+        mAux += delta * (double(val) - mAvg);
+        return *this;
+    }
+    /// @brief Add @a n samples with constant value @a val.
+    __hostdev__ Stats& add(const ValueT& val, uint64_t n)
+    {
+        const double denom = 1.0 / double(mSize + n);
+        const double delta = double(val) - mAvg;
+        mAvg += denom * delta * double(n);
+        mAux += denom * delta * delta * double(mSize) * double(n);
+        BaseT::add(val);
+        mSize += n;
+        return *this;
+    }
+
+    /// Add the samples from the other Stats instance.
+    __hostdev__ Stats& add(const Stats& other)
+    {
+        if (other.mSize > 0) {
+            const double denom = 1.0 / double(mSize + other.mSize);
+            const double delta = other.mAvg - mAvg;
+            mAvg += denom * delta * double(other.mSize);
+            mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize);
+            BaseT::add(other);
+            mSize += other.mSize;
+        }
+        return *this;
+    }
+
+    __hostdev__ static constexpr bool hasMinMax() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasAverage() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasStdDeviation() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasStats() { return !util::is_same<bool, ValueT>::value; }
+
+    __hostdev__ size_t size() const { return mSize; }
+
+    //@{
+    /// Return the arithmetic mean, i.e. average, value.
+    __hostdev__ double avg() const { return mAvg; }
+    __hostdev__ double mean() const { return mAvg; }
+    //@}
+
+    //@{
+    /// @brief Return the population variance.
+    ///
+    /// @note The unbiased sample variance = population variance * num/(num-1)
+    __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); }
+    __hostdev__ double variance() const { return this->var(); }
+    //@}
+
+    //@{
+    /// @brief Return the standard deviation (=Sqrt(variance)) as
+    ///        defined from the (biased) population variance.
+    __hostdev__ double std() const { return sqrt(this->var()); }
+    __hostdev__ double stdDev() const { return this->std(); }
+    //@}
+
+    template<typename NodeT>
+    __hostdev__ void setStats(NodeT &node) const
+    {
+        node.setMin(this->min());
+        node.setMax(this->max());
+        node.setAvg(this->avg());
+        node.setDev(this->std());
+    }
+}; // end Stats<ValueT, 0>
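+// The incremental (Welford-style) update above merits a tiny numeric check.
+// Illustrative sketch: adding {1, 2, 3} one sample at a time must agree with
+// the closed-form mean 2 and population variance 2/3:
+// @code
+// Stats<float> s;
+// s.add(1.0f).add(2.0f).add(3.0f);
+// // s.avg() == 2.0, s.var() == 2.0/3.0, s.std() == sqrt(2.0/3.0)
+// @endcode
+// The two-argument add(val, n) folds n identical samples in O(1), which is how
+// tile values (ChildT::NUM_VALUES voxels sharing one value) are handled below.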
+/// @brief This class computes statistics (minimum value, maximum
+///        value, mean, variance and standard deviation) of a population
+///        of floating-point values.
+///
+/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2,
+///          standard deviation = sqrt(variance)
+///
+/// @note This class employs incremental computation and double precision.
+template<typename ValueT>
+class Stats<ValueT, 1> : public Extrema<ValueT, 1>
+{
+protected:
+    using BaseT = Extrema<ValueT, 1>;
+    using RealT = double; // for accuracy the internal precision must be 64 bit floats
+    size_t mSize;
+    double mAvg, mAux;
+
+public:
+    using ValueType = ValueT;
+    __hostdev__ Stats()
+        : BaseT()
+        , mSize(0)
+        , mAvg(0.0)
+        , mAux(0.0)
+    {
+    }
+    /// @brief Add a single sample
+    __hostdev__ Stats& add(const ValueT& val)
+    {
+        typename BaseT::Pair tmp(val);
+        BaseT::add(tmp);
+        mSize += 1;
+        const double delta = tmp.scalar - mAvg;
+        mAvg += delta / double(mSize);
+        mAux += delta * (tmp.scalar - mAvg);
+        return *this;
+    }
+    /// @brief Add @a n samples with constant value @a val.
+    __hostdev__ Stats& add(const ValueT& val, uint64_t n)
+    {
+        typename BaseT::Pair tmp(val);
+        const double denom = 1.0 / double(mSize + n);
+        const double delta = tmp.scalar - mAvg;
+        mAvg += denom * delta * double(n);
+        mAux += denom * delta * delta * double(mSize) * double(n);
+        BaseT::add(tmp);
+        mSize += n;
+        return *this;
+    }
+
+    /// Add the samples from the other Stats instance.
+    __hostdev__ Stats& add(const Stats& other)
+    {
+        if (other.mSize > 0) {
+            const double denom = 1.0 / double(mSize + other.mSize);
+            const double delta = other.mAvg - mAvg;
+            mAvg += denom * delta * double(other.mSize);
+            mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize);
+            BaseT::add(other);
+            mSize += other.mSize;
+        }
+        return *this;
+    }
+
+    __hostdev__ static constexpr bool hasMinMax() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasAverage() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasStdDeviation() { return !util::is_same<bool, ValueT>::value; }
+    __hostdev__ static constexpr bool hasStats() { return !util::is_same<bool, ValueT>::value; }
+
+    __hostdev__ size_t size() const { return mSize; }
+
+    //@{
+    /// Return the arithmetic mean, i.e. average, value.
+    __hostdev__ double avg() const { return mAvg; }
+    __hostdev__ double mean() const { return mAvg; }
+    //@}
+
+    //@{
+    /// @brief Return the population variance.
+    ///
+    /// @note The unbiased sample variance = population variance * num/(num-1)
+    __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); }
+    __hostdev__ double variance() const { return this->var(); }
+    //@}
+
+    //@{
+    /// @brief Return the standard deviation (=Sqrt(variance)) as
+    ///        defined from the (biased) population variance.
+ __hostdev__ double std() const { return sqrt(this->var()); } + __hostdev__ double stdDev() const { return this->std(); } + //@} + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + node.setAvg(this->avg()); + node.setDev(this->std()); + } +}; // end Stats + +/// @brief No-op Stats class +template +struct NoopStats +{ + using ValueType = ValueT; + __hostdev__ NoopStats() {} + __hostdev__ NoopStats(const ValueT&) {} + __hostdev__ NoopStats& add(const ValueT&) { return *this; } + __hostdev__ NoopStats& add(const ValueT&, uint64_t) { return *this; } + __hostdev__ NoopStats& add(const NoopStats&) { return *this; } + __hostdev__ static constexpr size_t size() { return 0; } + __hostdev__ static constexpr bool hasMinMax() { return false; } + __hostdev__ static constexpr bool hasAverage() { return false; } + __hostdev__ static constexpr bool hasStdDeviation() { return false; } + __hostdev__ static constexpr bool hasStats() { return false; } + template + __hostdev__ void setStats(NodeT&) const{} +}; // end NoopStats + +//================================================================================================ + +/// @brief Allows for the construction of NanoVDB grids without any dependency +template> +class GridStats +{ + struct NodeStats; + using TreeT = typename GridT::TreeType; + using ValueT = typename TreeT::ValueType; + using BuildT = typename TreeT::BuildType; + using Node0 = typename TreeT::Node0; // leaf + using Node1 = typename TreeT::Node1; // lower + using Node2 = typename TreeT::Node2; // upper + using RootT = typename TreeT::Node3; // root + static_assert(util::is_same::value, "Mismatching type"); + + ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta + + void process( GridT& );// process grid and all tree nodes + void process( TreeT& );// process Tree, root node and child nodes + void process( RootT& );// process root node and child nodes + NodeStats process( Node0& );// process leaf node + + template + NodeStats process( NodeT& );// process internal node and child nodes + + template + void setStats(DataT*, const Extrema&); + template + void setStats(DataT*, const Stats&); + template + void setStats(DataT*, const NoopStats&) {} + + template + typename std::enable_if::value>::type + setFlag(const T&, const T&, FlagT& flag) const { flag &= ~FlagT(1); } // unset 1st bit to enable rendering + + template + typename std::enable_if::value>::type + setFlag(const T& min, const T& max, FlagT& flag) const; + +public: + GridStats() = default; + + void update(GridT& grid, ValueT delta = ValueT(0)); + +}; // GridStats + +template +struct GridStats::NodeStats +{ + StatsT stats; + CoordBBox bbox; + + NodeStats(): stats(), bbox() {}//activeCount(0), bbox() {}; + + NodeStats& add(const NodeStats &other) + { + stats.add( other.stats );// no-op for NoopStats?! 
+        bbox[0].minComponent(other.bbox[0]);
+        bbox[1].maxComponent(other.bbox[1]);
+        return *this;
+    }
+};// GridStats::NodeStats
+
+//================================================================================================
+
+template<typename GridT, typename StatsT>
+void GridStats<GridT, StatsT>::update(GridT& grid, ValueT delta)
+{
+    mDelta = delta; // delta = voxel size for level sets, else 0
+    this->process( grid );
+}
+
+//================================================================================================
+
+template<typename GridT, typename StatsT>
+template<typename DataT, int Rank>
+inline void GridStats<GridT, StatsT>::
+    setStats(DataT* data, const Extrema<ValueT, Rank>& e)
+{
+    data->setMin(e.min());
+    data->setMax(e.max());
+}
+
+template<typename GridT, typename StatsT>
+template<typename DataT, int Rank>
+inline void GridStats<GridT, StatsT>::
+    setStats(DataT* data, const Stats<ValueT, Rank>& s)
+{
+    data->setMin(s.min());
+    data->setMax(s.max());
+    data->setAvg(s.avg());
+    data->setDev(s.std());
+}
+
+//================================================================================================
+
+template<typename GridT, typename StatsT>
+template<typename T, typename FlagT>
+inline typename std::enable_if<util::is_floating_point<T>::value>::type
+GridStats<GridT, StatsT>::
+    setFlag(const T& min, const T& max, FlagT& flag) const
+{
+    if (mDelta > 0 && (min > mDelta || max < -mDelta)) {// LS: min > dx || max < -dx
+        flag |=  FlagT(1u);// set 1st bit to disable rendering
+    } else {
+        flag &= ~FlagT(1u);// unset 1st bit to enable rendering
+    }
+}
+
+//================================================================================================
+
+template<typename GridT, typename StatsT>
+void GridStats<GridT, StatsT>::process( GridT &grid )
+{
+    this->process( grid.tree() );// this processes tree, root and all nodes
+
+    // set world space AABB
+    auto& data = *grid.data();
+    const auto& indexBBox = grid.tree().root().bbox();
+    if (indexBBox.empty()) {
+        data.mWorldBBox = Vec3dBBox();
+        data.setBBoxOn(false);
+    } else {
+        // Note that below max is offset by one since CoordBBox.max is inclusive
+        // while bbox.max is exclusive. However, min is inclusive in both
+        // CoordBBox and Vec3dBBox. This also guarantees that a grid with a single
+        // active voxel, does not have an empty world bbox! E.g. if a grid with a
+        // unit index-to-world transformation only contains the active voxel (0,0,0)
+        // then indexBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0)
+        // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions
+        // of index and world bounding boxes inherited from OpenVDB!
+ grid.mWorldBBox = CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(grid.map()); + grid.setBBoxOn(true); + } + + // set bit flags + data.setMinMaxOn(StatsT::hasMinMax()); + data.setAverageOn(StatsT::hasAverage()); + data.setStdDeviationOn(StatsT::hasStdDeviation()); +} // GridStats::process( Grid ) + +//================================================================================================ + +template +inline void GridStats::process( typename GridT::TreeType &tree ) +{ + this->process( tree.root() ); +} + +//================================================================================================ + +template +void GridStats::process(RootT &root) +{ + using ChildT = Node2; + auto &data = *root.data(); + if (data.mTableSize == 0) { // empty root node + data.mMinimum = data.mMaximum = data.mBackground; + data.mAverage = data.mStdDevi = 0; + data.mBBox = CoordBBox(); + } else { + NodeStats total; + for (uint32_t i = 0; i < data.mTableSize; ++i) { + auto* tile = data.tile(i); + if (tile->isChild()) { // process child node + total.add( this->process( *data.getChild(tile) ) ); + } else if (tile->state) { // active tile + const Coord ijk = tile->origin(); + total.bbox[0].minComponent(ijk); + total.bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); + if (StatsT::hasStats()) { // resolved at compile time + total.stats.add(tile->value, ChildT::NUM_VALUES); + } + } + } + this->setStats(&data, total.stats); + if (total.bbox.empty()) { + std::cerr << "\nWarning in GridStats: input tree only contained inactive root tiles!" + << "\nWhile not strictly an error it's rather suspicious!\n"; + } + data.mBBox = total.bbox; + } +} // GridStats::process( RootNode ) + +//================================================================================================ + +template +template +typename GridStats::NodeStats +GridStats::process(NodeT &node) +{ + static_assert(util::is_same::value || util::is_same::value, "Incorrect node type"); + using ChildT = typename NodeT::ChildNodeType; + + NodeStats total; + auto* data = node.data(); + + // Serial processing of active tiles + if (const auto tileCount = data->mValueMask.countOn()) { + //total.activeCount = tileCount * ChildT::NUM_VALUES; // active tiles + for (auto it = data->mValueMask.beginOn(); it; ++it) { + if (StatsT::hasStats()) { // resolved at compile time + total.stats.add( data->mTable[*it].value, ChildT::NUM_VALUES ); + } + const Coord ijk = node.offsetToGlobalCoord(*it); + total.bbox[0].minComponent(ijk); + total.bbox[1].maxComponent(ijk + Coord(int32_t(ChildT::DIM) - 1)); + } + } + + // Serial or parallel processing of child nodes + if (const size_t childCount = data->mChildMask.countOn()) { +#ifndef NANOVDB_USE_TBB + for (auto it = data->mChildMask.beginOn(); it; ++it) { + total.add( this->process( *data->getChild(*it) ) ); + } +#else + std::unique_ptr childNodes(new ChildT*[childCount]); + ChildT **ptr = childNodes.get(); + for (auto it = data->mChildMask.beginOn(); it; ++it) { + *ptr++ = data->getChild( *it ); + } + using RangeT = tbb::blocked_range; + total.add( tbb::parallel_reduce(RangeT(0, childCount), NodeStats(), + [&](const RangeT &r, NodeStats local)->NodeStats { + for(size_t i=r.begin(); i!=r.end(); ++i){ + local.add( this->process( *childNodes[i] ) ); + } + return local;}, + [](NodeStats a, const NodeStats &b)->NodeStats { return a.add( b ); } + )); +#endif + } + + data->mBBox = total.bbox; + if (total.bbox.empty()) { + data->mFlags |= uint32_t(1); // set 1st bit on to disable rendering of node + data->mFlags &= 
~uint32_t(2); // set 2nd bit off since node does not contain active values + } else { + data->mFlags |= uint32_t(2); // set 2nd bit on since node contains active values + if (StatsT::hasStats()) { // resolved at compile time + this->setStats(data, total.stats); + this->setFlag(data->mMinimum, data->mMaximum, data->mFlags); + } + } + return total; +} // GridStats::process( InternalNode ) + +//================================================================================================ + +template +typename GridStats::NodeStats +GridStats::process(Node0 &leaf) +{ + NodeStats local; + if (leaf.updateBBox()) {// optionally update active bounding box (updates data->mFlags) + local.bbox[0] = local.bbox[1] = leaf.mBBoxMin; + local.bbox[1] += Coord(leaf.mBBoxDif[0], leaf.mBBoxDif[1], leaf.mBBoxDif[2]); + if (StatsT::hasStats()) {// resolved at compile time + for (auto it = leaf.cbeginValueOn(); it; ++it) local.stats.add(*it); + this->setStats(&leaf, local.stats); + this->setFlag(leaf.getMin(), leaf.getMax(), leaf.mFlags); + } + } + return local; +} // GridStats::process( LeafNode ) + +//================================================================================================ + +template +void updateGridStats(NanoGrid* grid, StatsMode mode) +{ + NANOVDB_ASSERT(grid); + using GridT = NanoGrid; + using ValueT = typename GridT::ValueType; + if (mode == StatsMode::Disable) { + return; + } else if (mode == StatsMode::BBox || util::is_same::value) { + GridStats > stats; + stats.update(*grid); + } else if (mode == StatsMode::MinMax) { + GridStats > stats; + stats.update(*grid); + } else if (mode == StatsMode::All) { + GridStats > stats; + stats.update(*grid); + } else { + throw std::runtime_error("gridStats: Unsupported statistics mode."); + } +}// updateGridStats + +template +[[deprecated("Use nanovdb::tools::updateGridStats(NanoGrid*, StatsMode) instead")]] +void gridStats(NanoGrid& grid, StatsMode mode = StatsMode::Default) +{ + updateGridStats(&grid, mode); +} + +//================================================================================================ + +namespace { + +// returns a bitmask (of size 32^3 or 16^3) that marks all the entries +// in a node table that intersects with the specified bounding box. +template +Mask getBBoxMask(const CoordBBox &bbox, const NodeT* node) +{ + Mask mask;// typically 32^3 or 16^3 bit mask + auto b = CoordBBox::createCube(node->origin(), node->dim()); + assert( bbox.hasOverlap(b) ); + if ( bbox.isInside(b) ) { + mask.setOn();//node is completely inside the bbox so early out + } else { + b.intersect(bbox);// trim bounding box + // transform bounding box from global to local coordinates + b.min() &= NodeT::DIM-1u; + b.min() >>= NodeT::ChildNodeType::TOTAL; + b.max() &= NodeT::DIM-1u; + b.max() >>= NodeT::ChildNodeType::TOTAL; + assert( !b.empty() ); + auto it = b.begin();// iterates over all the child nodes or tiles that intersects bbox + for (const Coord& ijk = *it; it; ++it) { + mask.setOn(ijk[2] + (ijk[1] << NodeT::LOG2DIM) + (ijk[0] << 2*NodeT::LOG2DIM)); + } + } + return mask; +}// getBBoxMask + +}// end of unnamed namespace + +/// @brief return the extrema of all the values in a grid that +/// intersects the specified bounding box. 
+template +Extrema::ValueType> +getExtrema(const NanoGrid& grid, const CoordBBox &bbox) +{ + using GridT = NanoGrid; + using ValueT = typename GridT::ValueType; + using TreeT = typename GridTree::type; + using RootT = typename NodeTrait::type;// root node + using Node2 = typename NodeTrait::type;// upper internal node + using Node1 = typename NodeTrait::type;// lower internal node + using Node0 = typename NodeTrait::type;// leaf node + + Extrema extrema; + const RootT &root = grid.tree().root(); + const auto &bbox3 = root.bbox(); + if (bbox.isInside(bbox3)) {// bbox3 is contained inside bbox + extrema.min(root.minimum()); + extrema.max(root.maximum()); + extrema.add(root.background()); + } else if (bbox.hasOverlap(bbox3)) { + const auto *data3 = root.data(); + for (uint32_t i=0; imTableSize; ++i) { + const auto *tile = data3->tile(i); + CoordBBox bbox2 = CoordBBox::createCube(tile->origin(), Node2::dim()); + if (!bbox.hasOverlap(bbox2)) continue; + if (tile->isChild()) { + const Node2 *node2 = data3->getChild(tile); + if (bbox.isInside(bbox2)) { + extrema.min(node2->minimum()); + extrema.max(node2->maximum()); + } else {// partial intersections at level 2 + auto *data2 = node2->data(); + const auto bboxMask2 = getBBoxMask(bbox, node2); + for (auto it2 = bboxMask2.beginOn(); it2; ++it2) { + if (data2->mChildMask.isOn(*it2)) { + const Node1* node1 = data2->getChild(*it2); + CoordBBox bbox1 = CoordBBox::createCube(node1->origin(), Node1::dim()); + if (bbox.isInside(bbox1)) { + extrema.min(node1->minimum()); + extrema.max(node1->maximum()); + } else {// partial intersection at level 1 + auto *data1 = node1->data(); + const auto bboxMask1 = getBBoxMask(bbox, node1); + for (auto it1 = bboxMask1.beginOn(); it1; ++it1) { + if (data1->mChildMask.isOn(*it1)) { + const Node0* node0 = data1->getChild(*it1); + CoordBBox bbox0 = CoordBBox::createCube(node0->origin(), Node0::dim()); + if (bbox.isInside(bbox0)) { + extrema.min(node0->minimum()); + extrema.max(node0->maximum()); + } else {// partial intersection at level 0 + auto *data0 = node0->data(); + const auto bboxMask0 = getBBoxMask(bbox, node0); + for (auto it0 = bboxMask0.beginOn(); it0; ++it0) { + extrema.add(data0->getValue(*it0)); + } + }// end partial intersection at level 0 + } else {// tile at level 1 + extrema.add(data1->mTable[*it1].value); + } + } + }// end of partial intersection at level 1 + } else {// tile at level 2 + extrema.add(data2->mTable[*it2].value); + } + }// loop over tiles and nodes at level 2 + }// end of partial intersection at level 1 + } else {// tile at root level + extrema.add(tile->value); + } + }// loop over root table + } else {// bbox does not overlap the grid + extrema.add(root.background()); + } + return extrema; +}// getExtrema + +}// namespace tools + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/tools/GridValidator.h b/nanovdb/nanovdb/tools/GridValidator.h new file mode 100644 index 0000000000..fbc4e14ded --- /dev/null +++ b/nanovdb/nanovdb/tools/GridValidator.h @@ -0,0 +1,244 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/tools/GridValidator.h + + \author Ken Museth + + \date August 30, 2020 + + \brief Checks the validity of an existing NanoVDB grid. 
+
+    \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes
+          after  v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks
+
+    When serialized:
+    [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...]
+*/
+
+#ifndef NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED
+
+#include <iostream> // for std::cerr
+
+#include <nanovdb/NanoVDB.h>
+#include <nanovdb/tools/GridChecksum.h>
+
+namespace nanovdb {
+
+namespace tools {
+
+/// @brief Performs several validation tests on a grid pointer.
+/// @tparam ValueT Build type of the input grid
+/// @param grid const pointer to the grid that needs validation
+/// @param mode Mode of the validation check (defined in GridChecksum.h)
+/// @param verbose If true information about the first failed test is printed to std::cerr
+/// @return Return true if the specified grid passes several validation tests.
+template <typename ValueT>
+bool isValid(const NanoGrid<ValueT> *grid, CheckMode mode, bool verbose = false);
+
+/// @brief Return true if the specified grid passes several validation tests.
+/// @tparam ValueT Build type of the input grid
+/// @param grid Grid to validate
+/// @param detailed If true the validation test is detailed and relatively slow.
+/// @param verbose If true information about the first failed test is printed to std::cerr
+/// @note This method has been deprecated by the one defined above
+template <typename ValueT>
+[[deprecated("Use isValid(const NanoGrid<ValueT>*, CheckMode, bool) instead.")]]
+bool isValid(const NanoGrid<ValueT> &grid, bool detailed = true, bool verbose = false)
+{
+    return isValid(&grid, detailed ? CheckMode::Full : CheckMode::Half, verbose);
+}
+
+//================================================================================================
+
+/// @brief validate grid
+template <typename ValueT>
+__hostdev__ char* checkGrid(const NanoGrid<ValueT> *grid, char *error, CheckMode mode = CheckMode::Full)
+{
+    *error = '\0';// reset error string
+    char str[32];// temporary buffer for toStr
+
+    // check Grid
+    if (grid == nullptr) {
+        return util::sprint(error, "Invalid pointer: Grid is NULL");
+    } else if (!isAligned(grid)) {
+        return util::sprint(error, "Invalid pointer: Grid is misaligned");
+    } else if (grid->mMagic != NANOVDB_MAGIC_NUMB && grid->mMagic != NANOVDB_MAGIC_GRID) {
+        return util::sprint(error, "Invalid magic number: ", toStr(str, toMagic(grid->mMagic)));
+    } else if (!grid->mVersion.isCompatible()) {
+        return util::sprint(error, "Incompatible version number: ", toStr(str, grid->mVersion));
+    } else if (grid->mGridCount == 0) {
+        return util::sprint(error, "Zero grid count");
+    } else if (grid->mGridIndex >= grid->mGridCount) {
+        return util::sprint(error, "grid index(", int(grid->mGridIndex), ") >= grid count(", int(grid->mGridCount), ")");
+    } else if (grid->mGridClass >= GridClass::End) {
+        return util::sprint(error, "Invalid GridClass(", toStr(str, grid->mGridClass), ")");
+    } else if (grid->mGridType >= GridType::End) {
+        return util::sprint(error, "Invalid GridType(", toStr(str, grid->mGridType), ")");
+    } else if (grid->mGridType != toGridType<ValueT>()) {
+        return util::sprint(error, "Invalid combination of BuildType(", toStr(str, toGridType<ValueT>()), ") and GridType(", toStr(str+16, grid->mGridType), ")");
+    } else if (!isValid(grid->mGridType, grid->mGridClass)) {
+        return util::sprint(error, "Invalid combination of GridType(", toStr(str, grid->mGridType), ") and GridClass(", toStr(str+16, grid->mGridClass), ")");
+    }
+
+    // check Tree
+    auto &tree = grid->tree();
+    if (auto *p = tree.getRoot()) {
+        if
(!isAligned(p)) return util::strcpy(error, "Invalid pointer: Root is misaligned"); + } else { + return util::strcpy(error, "Invalid pointer: Root is NULL"); + } + + // check Root + auto &root = tree.root(); + auto *rootData = root.data(); + if (rootData == nullptr) { + return util::strcpy(error, "Invalid pointer: Root is NULL"); + } else if (!isAligned((const void*)rootData)) { + return util::strcpy(error, "Invalid pointer: Root is misaligned"); + } else if ( (const uint8_t*)(rootData) < (const uint8_t*)(&tree+1)) { + return util::strcpy(error, "Invalid root pointer (should be located after the Grid and Tree)"); + } else if ( (const void*)(rootData) > util::PtrAdd(rootData, root.memUsage())) { + return util::strcpy(error, "Invalid root pointer (appears to be located after the end of the buffer)"); + } else {// check root tiles + const void *bounds[2] = {rootData + 1, util::PtrAdd(rootData, root.memUsage())}; + for (uint32_t i = 0; imTableSize; ++i) { + const void *tile = rootData->tile(i); + if ( tile < bounds[0] ) { + return util::strcpy(error, "Invalid root tile pointer (below lower bound"); + } else if (tile >= bounds[1]) { + return util::strcpy(error, "Invalid root tile pointer (above higher bound"); + } + } + } + if (mode == CheckMode::Half) return error; + + // check nodes + const bool test = grid->isBreadthFirst(); + auto *n0 = tree.template getFirstNode<0>(); + auto *n1 = tree.template getFirstNode<1>(); + auto *n2 = tree.template getFirstNode<2>(); + const void *bounds[3][2] = {{n0, util::PtrAdd(n0, grid->gridSize())}, {n1, n0}, {n2, n1}}; + + auto check = [&](const void *ptr, int level) -> bool { + if (ptr==nullptr) { + util::strcpy(error, "Invalid node pointer: node is NULL"); + } else if (!isAligned(ptr)) { + util::strcpy(error, "Invalid node pointer: node is misaligned"); + } else if (test && level == 0 && (const void*)(n0++) != ptr) { + util::strcpy(error, "Leaf node is not stored breadth-first"); + } else if (test && level == 1 && (const void*)(n1++) != ptr) { + util::strcpy(error, "Lower node is not stored breadth-first"); + } else if (test && level == 2 && (const void*)(n2++) != ptr) { + util::strcpy(error, "Upper node is not stored breadth-first"); + } else if ( ptr < bounds[level][0] ) { + util::strcpy(error, "Invalid node pointer: below lower bound"); + } else if ( ptr >= bounds[level][1] ) { + util::strcpy(error, "Invalid node pointer: above higher bound"); + } + return !util::empty(error); + }; + + for (auto it2 = root.cbeginChild(); it2; ++it2) { + if (check(&*it2, 2)) return error; + for (auto it1 = it2->cbeginChild(); it1; ++it1) { + if (check(&*it1, 1)) return error; + for (auto it0 = it1->cbeginChild(); it0; ++it0) if (check(&*it0, 0)) return error; + }// loop over child nodes of the upper internal node + }// loop over child nodes of the root node + + return error; +} // checkGrid + +//================================================================================================ + +template +bool isValid(const NanoGrid *grid, CheckMode mode, bool verbose) +{ + std::unique_ptr strUP(new char[100]); + char *str = strUP.get(); + + tools::checkGrid(grid, str, mode); + + if (util::empty(str) && !validateChecksum(grid, mode)) util::strcpy(str, "Mis-matching checksum"); + if (verbose && !util::empty(str)) std::cerr << "Validation failed: " << str << std::endl; + + return util::empty(str); +}// isValid + +//================================================================================================ + +struct IsNanoGridValid { + template + static bool known(const 
GridData *gridData, CheckMode mode, bool verbose)
+    {
+        return tools::isValid((const NanoGrid<BuildT>*)gridData, mode, verbose);
+    }
+    static bool unknown(const GridData *gridData, CheckMode, bool verbose)
+    {
+        if (verbose) {
+            char str[16];
+            std::cerr << "Unsupported GridType: \"" << toStr(str, gridData->mGridType) << "\"\n" << std::endl;
+        }
+        return false;
+    }
+};// IsNanoGridValid
+
+/// @brief Validate a specific grid in a GridHandle
+/// @tparam GridHandleT Type of GridHandle
+/// @param handle GridHandle containing host grids
+/// @param gridID linear index of the grid to be validated
+/// @param mode mode of the validation tests
+/// @param verbose if true information is printed if the grid fails a validation test
+/// @return true if grid @c gridID passes all the validation tests
+template <typename GridHandleT>
+bool validateGrid(const GridHandleT &handle, uint32_t gridID, CheckMode mode, bool verbose)
+{
+    if (mode == CheckMode::Disable) {
+        return true;
+    } else if (gridID >= handle.gridCount()) {
+        if (verbose) std::cerr << "grid index " << gridID << " exceeds available grid count " << handle.gridCount() << std::endl;
+        return false;
+    }
+    return callNanoGrid<IsNanoGridValid>(handle.gridData(gridID), mode, verbose);
+}// validateGrid
+
+//================================================================================================
+
+/// @brief Validate all the grids in a GridHandle
+/// @tparam GridHandleT Type of GridHandle
+/// @param handle GridHandle containing host grids (0,1...,N)
+/// @param mode mode of the validation tests
+/// @param verbose if true information is printed if a grid fails a validation test
+/// @return true if all grids pass all the validation tests
+template <typename GridHandleT>
+bool validateGrids(const GridHandleT &handle, CheckMode mode, bool verbose)
+{
+    if (mode == CheckMode::Disable) return true;
+    for (uint32_t gridID = 0; gridID < handle.gridCount(); ++gridID) {
+        if (!validateGrid(handle, gridID, mode, verbose)) return false;
+    }
+    return true;
+}// validateGrids
+
+}// namespace tools
+
+template <typename ValueT>
+[[deprecated("Use nanovdb::tools::checkGrid instead.")]]
+__hostdev__ char* checkGrid(const NanoGrid<ValueT> *grid, char *error, CheckMode mode = CheckMode::Full)
+{
+    return tools::checkGrid(grid, error, mode);
+}
+
+template <typename ValueT>
+[[deprecated("Use nanovdb::tools::isValid instead.")]]
+bool isValid(const NanoGrid<ValueT> *grid, CheckMode mode, bool verbose = false)
+{
+    return tools::isValid(grid, mode, verbose);
+}
+
+}// namespace nanovdb
+
+#endif // NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/NanoToOpenVDB.h b/nanovdb/nanovdb/tools/NanoToOpenVDB.h
new file mode 100644
index 0000000000..3723a4ecb1
--- /dev/null
+++ b/nanovdb/nanovdb/tools/NanoToOpenVDB.h
@@ -0,0 +1,366 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/NanoToOpenVDB.h
+
+    \author Ken Museth
+
+    \date May 6, 2020
+
+    \brief This class will deserialize a NanoVDB grid into an OpenVDB grid.
+
+    \todo Add support for PointIndexGrid and PointDataGrid
+*/
+
+#include <nanovdb/GridHandle.h> // manages and streams the raw memory buffer of a NanoVDB grid.
+#include <nanovdb/NanoVDB.h>
+#include <nanovdb/util/ForEach.h>
+
+#include <openvdb/openvdb.h>
+
+#ifndef NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED
+
+template<typename T>
+struct ConvertTrait {using Type = T;};
+
+template<typename T>
+struct ConvertTrait<nanovdb::math::Vec3<T>> {using Type = openvdb::math::Vec3<T>;};
+
+template<typename T>
+struct ConvertTrait<nanovdb::math::Vec4<T>> {using Type = openvdb::math::Vec4<T>;};
+
+template<>
+struct ConvertTrait<nanovdb::Fp4> {using Type = float;};
+
+template<>
+struct ConvertTrait<nanovdb::Fp8> {using Type = float;};
+
+template<>
+struct ConvertTrait<nanovdb::Fp16> {using Type = float;};
+
+template<>
+struct ConvertTrait<nanovdb::FpN> {using Type = float;};
+
+template<>
+struct ConvertTrait<nanovdb::ValueMask> {using Type = openvdb::ValueMask;};
+
+namespace nanovdb {
+
+namespace tools {
+
+/// @brief Forward declaration of free-standing function that de-serializes a typed NanoVDB grid into an OpenVDB Grid
+template<typename NanoBuildT>
+typename openvdb::Grid<typename openvdb::tree::Tree4<typename ConvertTrait<NanoBuildT>::Type>::Type>::Ptr
+nanoToOpenVDB(const NanoGrid<NanoBuildT>& grid, int verbose = 0);
+
+/// @brief Forward declaration of free-standing function that de-serializes a NanoVDB GridHandle into an OpenVDB GridBase
+template<typename BufferT>
+openvdb::GridBase::Ptr
+nanoToOpenVDB(const GridHandle<BufferT>& handle, int verbose = 0, uint32_t n = 0);
+
+/// @brief This class will deserialize a NanoVDB grid into an OpenVDB grid.
+template<typename NanoBuildT>
+class NanoToOpenVDB
+{
+    using NanoNode0 = nanovdb::LeafNode<NanoBuildT, openvdb::Coord, openvdb::util::NodeMask>; // note that it's using openvdb coord and mask types!
+    using NanoNode1 = nanovdb::InternalNode<NanoNode0>;
+    using NanoNode2 = nanovdb::InternalNode<NanoNode1>;
+    using NanoRootT = nanovdb::RootNode<NanoNode2>;
+    using NanoTreeT = nanovdb::Tree<NanoRootT>;
+    using NanoGridT = nanovdb::Grid<NanoTreeT>;
+    using NanoValueT = typename NanoGridT::ValueType;
+
+    using OpenBuildT = typename ConvertTrait<NanoBuildT>::Type; // e.g. float -> float but nanovdb::math::Vec3<T> -> openvdb::math::Vec3<T>
+    using OpenNode0 = openvdb::tree::LeafNode<OpenBuildT, 3>; // leaf
+    using OpenNode1 = openvdb::tree::InternalNode<OpenNode0, 4>; // lower
+    using OpenNode2 = openvdb::tree::InternalNode<OpenNode1, 5>; // upper
+    using OpenRootT = openvdb::tree::RootNode<OpenNode2>;
+    using OpenTreeT = openvdb::tree::Tree<OpenRootT>;
+    using OpenGridT = openvdb::Grid<OpenTreeT>;
+    using OpenValueT = typename OpenGridT::ValueType;
+
+public:
+    /// @brief Default c-tor
+    NanoToOpenVDB(){};
+
+    /// @brief Return a shared pointer to an OpenVDB grid constructed from the specified NanoVDB grid
+    typename OpenGridT::Ptr operator()(const NanoGrid<NanoBuildT>& grid, int verbose = 0);
+
+private:
+
+    template<typename OpenNodeT, typename NanoNodeT>
+    OpenNodeT* processNode(const NanoNodeT*);
+
+    OpenNode2* process(const NanoNode2* node) {return this->template processNode<OpenNode2>(node);}
+    OpenNode1* process(const NanoNode1* node) {return this->template processNode<OpenNode1>(node);}
+
+    template<typename NanoLeafT>
+    typename std::enable_if<!std::is_same<bool, typename NanoLeafT::BuildType>::value &&
+                            !std::is_same<nanovdb::ValueMask, typename NanoLeafT::BuildType>::value &&
+                            !std::is_same<nanovdb::Fp4,  typename NanoLeafT::BuildType>::value &&
+                            !std::is_same<nanovdb::Fp8,  typename NanoLeafT::BuildType>::value &&
+                            !std::is_same<nanovdb::Fp16, typename NanoLeafT::BuildType>::value &&
+                            !std::is_same<nanovdb::FpN,  typename NanoLeafT::BuildType>::value,
+                            OpenNode0*>::type
+    process(const NanoLeafT* node);
+
+    template<typename NanoLeafT>
+    typename std::enable_if<std::is_same<nanovdb::Fp4,  typename NanoLeafT::BuildType>::value ||
+                            std::is_same<nanovdb::Fp8,  typename NanoLeafT::BuildType>::value ||
+                            std::is_same<nanovdb::Fp16, typename NanoLeafT::BuildType>::value ||
+                            std::is_same<nanovdb::FpN,  typename NanoLeafT::BuildType>::value,
+                            OpenNode0*>::type
+    process(const NanoLeafT* node);
+
+    template<typename NanoLeafT>
+    typename std::enable_if<std::is_same<nanovdb::ValueMask, typename NanoLeafT::BuildType>::value,
+                            OpenNode0*>::type
+    process(const NanoLeafT* node);
+
+    template<typename NanoLeafT>
+    typename std::enable_if<std::is_same<bool, typename NanoLeafT::BuildType>::value,
+                            OpenNode0*>::type
+    process(const NanoLeafT* node);
+
+    /// converts nanovdb value types to openvdb value types, e.g.
nanovdb::Vec3f& -> openvdb::Vec3f& + static const OpenValueT& Convert(const NanoValueT &v) {return reinterpret_cast(v);} + static const OpenValueT* Convert(const NanoValueT *v) {return reinterpret_cast(v);} + +}; // NanoToOpenVDB class + +template +typename NanoToOpenVDB::OpenGridT::Ptr +NanoToOpenVDB::operator()(const NanoGrid& grid, int /*verbose*/) +{ + // since the input nanovdb grid might use nanovdb types (Coord, Mask, Vec3) we cast to use openvdb types + const NanoGridT *srcGrid = reinterpret_cast(&grid); + + auto dstGrid = openvdb::createGrid(Convert(srcGrid->tree().background())); + dstGrid->setName(srcGrid->gridName()); // set grid name + switch (srcGrid->gridClass()) { // set grid class + case nanovdb::GridClass::LevelSet: + dstGrid->setGridClass(openvdb::GRID_LEVEL_SET); + break; + case nanovdb::GridClass::FogVolume: + dstGrid->setGridClass(openvdb::GRID_FOG_VOLUME); + break; + case nanovdb::GridClass::Staggered: + dstGrid->setGridClass(openvdb::GRID_STAGGERED); + break; + case nanovdb::GridClass::PointIndex: + throw std::runtime_error("NanoToOpenVDB does not yet support PointIndexGrids"); + case nanovdb::GridClass::PointData: + throw std::runtime_error("NanoToOpenVDB does not yet support PointDataGrids"); + default: + dstGrid->setGridClass(openvdb::GRID_UNKNOWN); + } + // set transform + const nanovdb::Map& nanoMap = reinterpret_cast(srcGrid)->mMap; + auto mat = openvdb::math::Mat4::identity(); + mat.setMat3(openvdb::math::Mat3(nanoMap.mMatD)); + mat.transpose(); // the 3x3 in nanovdb is transposed relative to openvdb's 3x3 + mat.setTranslation(openvdb::math::Vec3(nanoMap.mVecD)); + dstGrid->setTransform(openvdb::math::Transform::createLinearTransform(mat)); // calls simplify! + + // process root node + auto &root = dstGrid->tree().root(); + auto *data = srcGrid->tree().root().data(); + for (uint32_t i=0; imTableSize; ++i) { + auto *tile = data->tile(i); + if (tile->isChild()) { + root.addChild( this->process( data->getChild(tile)) ); + } else { + root.addTile(tile->origin(), Convert(tile->value), tile->state); + } + } + + return dstGrid; +} + +template +template +DstNodeT* +NanoToOpenVDB::processNode(const SrcNodeT *srcNode) +{ + DstNodeT *dstNode = new DstNodeT(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + const auto& childMask = srcNode->childMask(); + const_cast(dstNode->getValueMask()) = srcNode->valueMask(); + const_cast(dstNode->getChildMask()) = childMask; + auto* dstTable = const_cast(dstNode->getTable()); + auto* srcData = srcNode->data(); + std::vector> childNodes; + const auto childCount = childMask.countOn(); + childNodes.reserve(childCount); + for (uint32_t n = 0; n < DstNodeT::NUM_VALUES; ++n) { + if (childMask.isOn(n)) { + childNodes.emplace_back(n, srcData->getChild(n)); + } else { + dstTable[n].setValue(Convert(srcData->mTable[n].value)); + } + } + auto kernel = [&](const auto& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto &p = childNodes[i]; + dstTable[p.first].setChild( this->process(p.second) ); + } + }; + +#if 0 + kernel(Range1D(0, childCount)); +#else + util::forEach(0, childCount, 1, kernel); +#endif + return dstNode; +} // processNode + +template +template +inline typename std::enable_if::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + 
OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + + const auto* src = Convert(srcNode->data()->mValues);// doesn't work for compressed data, bool or ValueMask + for (auto *dst = dstNode->buffer().data(), *end = dst + OpenNode0::SIZE; dst != end; dst += 4, src += 4) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + float *dst = dstNode->buffer().data(); + for (int i=0; i!=512; i+=4) { + *dst++ = srcNode->getValue(i); + *dst++ = srcNode->getValue(i+1); + *dst++ = srcNode->getValue(i+2); + *dst++ = srcNode->getValue(i+3); + } + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + reinterpret_cast&>(dstNode->buffer()) = srcNode->data()->mValues; + + return dstNode; +} // process(NanoNode0) + +template +inline typename openvdb::Grid::Type>::Type>::Ptr +nanoToOpenVDB(const NanoGrid& grid, int verbose) +{ + NanoToOpenVDB tmp; + return tmp(grid, verbose); +} + +template +openvdb::GridBase::Ptr +nanoToOpenVDB(const GridHandle& handle, int verbose, uint32_t n) +{ + if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return 
tools::nanoToOpenVDB(*grid, verbose);
+    } else if (auto grid = handle.template grid(n)) {
+        return tools::nanoToOpenVDB(*grid, verbose);
+    } else if (auto grid = handle.template grid(n)) {
+        return tools::nanoToOpenVDB(*grid, verbose);
+    } else if (auto grid = handle.template grid(n)) {
+        return tools::nanoToOpenVDB(*grid, verbose);
+    } else {
+        OPENVDB_THROW(openvdb::RuntimeError, "Unsupported NanoVDB grid type!");
+    }
+}// tools::nanoToOpenVDB
+
+}// namespace tools
+
+/// @brief Free-standing function that de-serializes a typed NanoVDB grid into an OpenVDB Grid
+template
+[[deprecated("Use nanovdb::tools::nanoToOpenVDB instead.")]]
+typename openvdb::Grid::Type>::Type>::Ptr
+nanoToOpenVDB(const NanoGrid& grid, int verbose = 0)
+{
+    return tools::nanoToOpenVDB(grid, verbose);
+}
+
+/// @brief Free-standing function that de-serializes a NanoVDB GridHandle into an OpenVDB GridBase
+template
+[[deprecated("Use nanovdb::tools::nanoToOpenVDB instead.")]]
+openvdb::GridBase::Ptr
+nanoToOpenVDB(const GridHandle& handle, int verbose = 0, uint32_t n = 0)
+{
+    return tools::nanoToOpenVDB(handle, verbose, n);
+}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/cuda/AddBlindData.cuh b/nanovdb/nanovdb/tools/cuda/AddBlindData.cuh
new file mode 100644
index 0000000000..a7dcfcd6f5
--- /dev/null
+++ b/nanovdb/nanovdb/tools/cuda/AddBlindData.cuh
@@ -0,0 +1,146 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/cuda/AddBlindData.cuh
+
+    \author Ken Museth
+
+    \date August 3, 2023
+
+    \brief Defines a function that appends blind device data to an existing device NanoGrid
+
+    \warning The header file contains cuda device code so be sure
+             to only include it in .cu files (or other .cuh files)
+*/
+
+#ifndef NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED
+#define NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include <cstring> // for std::strcpy
+
+namespace nanovdb {// ================================================
+
+namespace tools::cuda {// ============================================
+
+/// @brief This function appends blind data to an existing NanoGrid
+/// @tparam BuildT Build type of the grid
+/// @tparam BlindDataT Type of the blind data
+/// @tparam BufferT Type of the buffer used for allocation
+/// @param d_grid Pointer to device grid
+/// @param d_blindData Pointer to device blind data
+/// @param valueCount number of values in the blind data
+/// @param blindClass class of the blind data
+/// @param semantics semantics of the blind data
+/// @param name optional name of the blind data
+/// @param pool optional pool used for allocation
+/// @param stream optional CUDA stream (defaults to CUDA stream 0)
+/// @return GridHandle with blind data appended
+template
+GridHandle
+addBlindData(const NanoGrid *d_grid,
+             const BlindDataT *d_blindData,
+             uint64_t valueCount,
+             GridBlindDataClass blindClass = GridBlindDataClass::Unknown,
+             GridBlindDataSemantic semantics = GridBlindDataSemantic::Unknown,
+             const char *name = "",
+             const BufferT &pool = BufferT(),
+             cudaStream_t stream = 0)
+{
+    // In:  |-----------|----------|-----------|
+    //        old grid    old meta    old data
+    // Out: |-----------|----------|----------|-----------|------------|
+    //        old grid    old meta   new meta    old data    new data
+
+    static_assert(BufferTraits::hasDeviceDual, "Expected BufferT to 
support device allocation"); + + // extract byte sizes of the grid, blind meta data and blind data + enum {GRID=0, META=1, DATA=2, CHECKSUM=3}; + uint64_t tmp[4], *d_tmp; + cudaCheck(util::cuda::mallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream)); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + if (auto count = d_grid->blindDataCount()) { + d_tmp[GRID] = util::PtrDiff(&d_grid->blindMetaData(0), d_grid); + d_tmp[META] = count*sizeof(GridBlindMetaData); + d_tmp[DATA] = d_grid->gridSize() - d_tmp[GRID] - d_tmp[META]; + } else { + d_tmp[GRID] = d_grid->gridSize(); + d_tmp[META] = d_tmp[DATA] = 0u; + } + d_tmp[CHECKSUM] = d_grid->checksum().full(); + }); cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&tmp, d_tmp, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); + + GridBlindMetaData metaData{int64_t(sizeof(GridBlindMetaData) + tmp[DATA]), valueCount, + sizeof(BlindDataT), semantics, blindClass, toGridType()}; + if (!metaData.isValid()) throw std::runtime_error("cudaAddBlindData: invalid combination of blind meta data"); + std::strcpy(metaData.mName, name); + auto buffer = BufferT::create(tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + metaData.blindDataSize(), &pool, false); + void *d_data = buffer.deviceData(); + + // 1: |-----------|----------| + // old grid old meta + cudaCheck(cudaMemcpyAsync(d_data, d_grid, tmp[GRID] + tmp[META], cudaMemcpyDeviceToDevice, stream)); + + // 2: |-----------|----------|----------| + // old grid old meta new meta + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META], &metaData, sizeof(GridBlindMetaData), cudaMemcpyHostToDevice, stream)); + + // 3: |-----------|----------|----------|-----------| + // old grid old meta new meta old data + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData), + (const char*)d_grid + tmp[GRID] + tmp[META], tmp[DATA], cudaMemcpyDeviceToDevice, stream)); + + // 4: |-----------|----------|----------|-----------|------------| + // old grid old meta new meta old data new data + const size_t dataSize = valueCount*sizeof(BlindDataT);// no padding + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA], + d_blindData, dataSize, cudaMemcpyDeviceToDevice, stream)); + if (auto padding = metaData.blindDataSize() - dataSize) {// zero out possible padding + cudaCheck(cudaMemsetAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + dataSize, 0, padding, stream)); + } + + // increment grid size and blind data counter in output grid + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + auto &grid = *reinterpret_cast*>(d_data); + grid.mBlindMetadataCount += 1; + grid.mBlindMetadataOffset = d_tmp[GRID]; + auto *meta = util::PtrAdd(d_data, grid.mBlindMetadataOffset);// points to first blind meta data + for (uint32_t i=0, n=grid.mBlindMetadataCount-1; imDataOffset += sizeof(GridBlindMetaData); + grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment + }); cudaCheckError(); + cudaCheck(util::cuda::freeAsync(d_tmp, stream)); + + Checksum cs(tmp[CHECKSUM]); + cuda::updateChecksum(reinterpret_cast(d_data), cs.mode(), stream); + + return GridHandle(std::move(buffer)); +}// cudaAddBlindData + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::cuda::addBlindData instead")]] +GridHandle +cudaAddBlindData(const NanoGrid *d_grid, + const BlindDataT *d_blindData, + uint64_t valueCount, + 
GridBlindDataClass blindClass = GridBlindDataClass::Unknown,
+                 GridBlindDataSemantic semantics = GridBlindDataSemantic::Unknown,
+                 const char *name = "",
+                 const BufferT &pool = BufferT(),
+                 cudaStream_t stream = 0)
+{
+    return tools::cuda::addBlindData(d_grid, d_blindData, valueCount, blindClass, semantics, name, pool, stream);
+}
+
+}// namespace nanovdb
+
+#endif // NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED
\ No newline at end of file
diff --git a/nanovdb/nanovdb/tools/cuda/GridChecksum.cuh b/nanovdb/nanovdb/tools/cuda/GridChecksum.cuh
new file mode 100644
index 0000000000..b1f61e2fe7
--- /dev/null
+++ b/nanovdb/nanovdb/tools/cuda/GridChecksum.cuh
@@ -0,0 +1,441 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/cuda/GridChecksum.cuh
+
+    \author Ken Museth
+
+    \date September 28, 2023
+
+    \brief Compute CRC32 checksum of NanoVDB grids
+
+    \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes
+          after  v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks
+
+    When serialized:
+    [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...]
+    checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] --------------->
+    checksum[2] after  v32.6.0: <---[0]---><----------------------------------------[1]---------------------------------------->
+*/
+
+#ifndef NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED
+
+#include
+#include // required for instantiation of move c-tor of GridHandle
+#include
+#include
+#include
+
+namespace nanovdb {// =======================================================================
+
+namespace tools::cuda {// ===================================================================
+
+/// @brief Compute the (2 x CRC32) checksum of the specified @c d_gridData on the device
+/// @param d_gridData Device base pointer to the grid from which the checksum is computed.
+/// @param mode Defines the mode of computation for the checksum.
+/// @param stream optional cuda stream (defaults to zero)
+/// @return Return the (2 x CRC32) checksum of the specified @c d_gridData
+Checksum evalChecksum(const GridData *d_gridData, CheckMode mode = CheckMode::Default, cudaStream_t stream = 0);
+
+/// @brief Extract the checksum of a device grid
+/// @param d_gridData Device base pointer to grid with a checksum
+/// @param stream optional cuda stream (defaults to zero)
+/// @return Checksum encoded in the specified grid
+Checksum getChecksum(const GridData *d_gridData, cudaStream_t stream = 0);
+
+/// @brief Return true if the checksum of @c d_gridData matches the expected
+///        value already encoded into the grid's meta data.
+/// @param d_gridData Grid whose checksum is validated.
+/// @param mode Defines the mode of computation for the checksum.
+/// @param stream optional cuda stream (defaults to zero)
+bool validateChecksum(const GridData *d_gridData, CheckMode mode = CheckMode::Default, cudaStream_t stream = 0);
+
+/// @brief Update the checksum of a device grid
+/// @param d_gridData device pointer to GridData
+/// @param mode Mode of computation for the checksum.
+/// @param stream optional cuda stream (defaults to zero)
+void updateChecksum(GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0);
+
+/// @brief Updates the checksum of a device grid by preserving its mode
+/// @param d_gridData Device base pointer to grid
+/// @param stream optional cuda stream (defaults to zero)
+inline void updateChecksum(GridData *d_gridData, cudaStream_t stream = 0)
+{
+    updateChecksum(d_gridData, getChecksum(d_gridData, stream).mode(), stream);
+}
+
+}// namespace tools::cuda
+
+namespace util::cuda {
+
+/// @brief Cuda kernel that computes CRC32 checksums of blocks of data using a look-up-table
+/// @param d_data device pointer to raw data from which to compute the CRC32 checksums
+/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block
+/// @param blockCount number of blocks and checksums
+/// @param blockSize size of each block in bytes
+/// @param d_lut device pointer to CRC32 Lookup Table
+template
+__global__ void crc32Kernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize, const uint32_t *d_lut)
+{
+    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < blockCount) d_blockCRC[tid] = crc32((const uint8_t*)d_data + tid * blockSize, blockSize, d_lut);
+}
+
+/// @brief Cuda kernel that computes CRC32 checksums of blocks of data (without using a look-up-table)
+/// @param d_data device pointer to raw data from which to compute the CRC32 checksums
+/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block
+/// @param blockCount number of blocks and checksums
+/// @param blockSize size of each block in bytes
+template
+__global__ void crc32Kernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize)
+{
+    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < blockCount) d_blockCRC[tid] = crc32((const uint8_t*)d_data + tid * blockSize, blockSize);
+}
+
+/// @brief Host function to allocate and initialize a Look-Up-Table of size 256 for subsequent CRC32 computation on the device
+/// @param extra number of extra elements in the LUT
+/// @param stream optional cuda stream (defaults to zero)
+/// @return returns a nanovdb::util::cuda::unique_ptr pointing to a lookup-table for CRC32 computation
+inline unique_ptr createCrc32Lut(size_t extra = 0, cudaStream_t stream = 0)
+{
+    unique_ptr lut(256 + extra, stream);
+    uint32_t *d_lut = lut.get();
+    lambdaKernel<<<1, 256, 0, stream>>>(256, [=] __device__(size_t tid) {initCrc32Lut(d_lut, tid);});
+    cudaCheckError();
+    return lut;
+}
+
+/// @brief Compute the CRC32 checksum of data processed in 4K blocks
+/// @param d_data device pointer to start of data
+/// @param size number of bytes
+/// @param d_lut Look-Up-Table for CRC32 computation
+/// @param d_crc device pointer where the resulting checksum is written
+/// @param stream optional cuda stream (defaults to zero)
+inline void blockedCRC32(const void *d_data, size_t size, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream)
+{
+    NANOVDB_ASSERT(d_data && d_lut && d_crc);
+    static constexpr unsigned int threadsPerBlock = 128;// seems faster than the old value of 256!
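// Two-pass scheme: first one CRC32 checksum is computed per 4KB block in parallel
// (the final block also covers any remainder of the data), then the returned
// checksum is the CRC32 of the resulting array of per-block checksums.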
+ const uint64_t checksumCount = size >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte) + unique_ptr buffer(checksumCount, stream);// for checksums of 4 KB blocks + uint32_t *d_checksums = buffer.get(); + lambdaKernel<<>>(checksumCount, [=] __device__(size_t tid) { + uint32_t blockSize = 1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE; + if (tid+1 == checksumCount) blockSize += size - (checksumCount<>>(1, [=] __device__(size_t) {// Compute CRC32 of all the 4K blocks + *d_crc = crc32((const uint8_t*)d_checksums, checksumCount*sizeof(uint32_t), d_lut); + }); cudaCheckError(); +}// void cudaBlockedCRC32(const void *d_data, size_t size, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + +/// @brief Compute CRC32 checksum of 4K block +/// @param d_begin device pointer to start of data (inclusive) +/// @param d_end device pointer to end of data (exclusive) +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param stream optional cuda stream (defaults to zero) +inline void blockedCRC32(const void *d_begin, const void *d_end, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + blockedCRC32(d_begin, PtrDiff(d_end, d_begin), d_lut, d_crc, stream); +} + +}// namespace util::cuda + +namespace tools::cuda { + +/// @brief +/// @param d_gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param d_crc +/// @param stream optional cuda stream (defaults to zero) +inline void crc32Head(const GridData *d_gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData && d_lut && d_crc); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t){*d_crc = tools::crc32Head(d_gridData, d_lut);}); +}// void cudaCrc32Head(const GridData *d_gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + +/// @brief +/// @param d_gridData +/// @param gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param stream optional cuda stream (defaults to zero) +inline void crc32Tail(const GridData *d_gridData, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData && gridData && d_lut && d_crc); + NANOVDB_ASSERT(gridData->mVersion > Version(32,6,0)); + const uint8_t *d_begin = (const uint8_t*)d_gridData; + util::cuda::blockedCRC32(d_begin + sizeof(GridData) + sizeof(TreeData), d_begin + gridData->mGridSize, d_lut, d_crc, stream); +} + +/// @brief +/// @tparam ValueT +/// @param d_grid +/// @param gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param d_crc +/// @param stream +template +void crc32TailOld(const NanoGrid *d_grid, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + static constexpr unsigned int threadsPerBlock = 128;// seems faster than the old value of 256! 
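// Pre-v32.6.0 tail layout: a NodeManager is built to address the upper, lower and
// leaf nodes in serialized order, one CRC32 is computed per node, and the tail
// checksum is the CRC32 of the resulting array of per-node checksums.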
+ auto nodeMgrHandle = nanovdb::cuda::createNodeManager(d_grid, nanovdb::cuda::DeviceBuffer(), stream); + auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); + NANOVDB_ASSERT(isAligned(d_nodeMgr)); + const uint32_t nodeCount[3]={gridData->template nodeCount<0>(), gridData->template nodeCount<1>(), gridData->template nodeCount<2>()}; + util::cuda::unique_ptr d_checksumsUP(nodeCount[0]+nodeCount[1]+nodeCount[2]); + uint32_t *d_checksums = d_checksumsUP.get(), *d_ptr = d_checksums; + + util::cuda::lambdaKernel<<>>(nodeCount[2], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->upper(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + d_ptr += nodeCount[2]; + util::cuda::lambdaKernel<<>>(nodeCount[1], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->lower(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + d_ptr += nodeCount[1]; + util::cuda::lambdaKernel<<>>(nodeCount[0], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->leaf(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + *d_crc = util::crc32(d_checksums, d_nodeMgr->tree().totalNodeCount()*sizeof(uint32_t), d_lut); + }); cudaCheckError(); +}// void cudaCrc32TailOld(const NanoGrid *d_grid, const GridData *gridData, uint32_t *d_lut, cudaStream_t stream) + +struct Crc32TailOld { + template + static void known(const GridData *d_gridData, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + { + crc32TailOld((const NanoGrid*)d_gridData, gridData, d_lut, d_crc, stream); + } + static void unknown(const GridData*, const GridData*, const uint32_t*, uint32_t*, cudaStream_t) + { + throw std::runtime_error("Cannot call cudaCrc32TailOld with grid of unknown type"); + } +};// Crc32TailOld + +/// @brief +/// @param d_gridData +/// @param mode +/// @param stream +/// @return +inline Checksum evalChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_gridData); + Checksum cs; + if (mode != CheckMode::Empty) { + auto d_lut = util::cuda::createCrc32Lut(1, stream); + crc32Head(d_gridData, d_lut.get(), d_lut.get() + 256, stream); + cudaCheck(cudaMemcpyAsync(&(cs.head()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + if (mode == CheckMode::Full) { + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_gridData, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_gridData, gridData, d_lut.get(), d_lut.get() + 256, stream); + } else { + callNanoGrid(d_gridData, gridData, d_lut.get(), d_lut.get() + 256, stream); + } + cudaCheck(cudaMemcpyAsync(&(cs.tail()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + } + } + return cs; +} + +/// @brief +/// @tparam BuildT +/// @param d_grid +/// @param mode +/// @param stream +/// @return +template +Checksum evalChecksum(const NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_grid); + Checksum cs; + if (mode != CheckMode::Empty) { + auto d_lut = util::cuda::createCrc32Lut(1, stream); + crc32Head(d_grid, d_lut.get(), d_lut.get() + 256, 
stream); + cudaCheck(cudaMemcpyAsync(&(cs.head()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + if (mode == CheckMode::Full) { + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_grid, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_grid, gridData, d_lut.get(), d_lut.get() + 256, stream); + } else { + crc32TailOld(d_grid, gridData, d_lut.get(), d_lut.get() + 256, stream); + } + cudaCheck(cudaMemcpyAsync(&(cs.tail()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + } + } + return cs; +} + +/// @brief +/// @param d_gridData +/// @param mode +/// @param stream +/// @return +inline bool validateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_gridData); + if (mode == CheckMode::Empty) return true; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_gridData, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mChecksum.isEmpty()) return true;// checksum is empty so nothing to check + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(1, stream);// unique pointer + uint32_t crc = 0, *d_crc = d_lut.get() + 256; + + // Check head checksum + crc32Head(d_gridData, d_lut.get(), d_crc, stream); + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + const bool checkHead = (crc == gridData->mChecksum.head()); + if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) return checkHead; + + // Check tail checksum + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_gridData, gridData, d_lut.get(), d_crc, stream); + } else { + callNanoGrid(d_gridData, gridData, d_lut.get(), d_crc, stream); + } + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + return crc == gridData->mChecksum.tail(); +}// bool cudaValidateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0) + +/// @brief +/// @tparam BuildT +/// @param d_grid +/// @param mode +/// @param stream +/// @return +template +bool validateChecksum(const NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_grid); + if (mode == CheckMode::Empty) return true; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_grid, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mChecksum.isEmpty()) return true;// checksum is empty so nothing to check + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(1, stream);// unique pointer + uint32_t crc = 0, *d_crc = d_lut.get() + 256; + + // Check head checksum + crc32Head(d_grid, d_lut.get(), d_crc, stream); + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + const bool checkHead = (crc == gridData->mChecksum.head()); + if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) return checkHead; + + // Check tail checksum + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_grid, gridData, 
d_lut.get(), d_crc, stream);
+    } else {
+        crc32TailOld(d_grid, gridData, d_lut.get(), d_crc, stream);
+    }
+    cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+    return crc == gridData->mChecksum.tail();
+}// bool cudaValidateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0)
+
+/// @brief Extract the checksum of a device grid
+/// @param d_gridData Device pointer to grid with a checksum
+/// @param stream optional cuda stream (defaults to zero)
+inline Checksum getChecksum(const GridData *d_gridData, cudaStream_t stream)
+{
+    NANOVDB_ASSERT(d_gridData);
+    Checksum cs;
+    cudaCheck(cudaMemcpyAsync(&cs, (const uint8_t*)d_gridData + 8, sizeof(cs), cudaMemcpyDeviceToHost, stream));
+    return cs;
+}
+
+/// @brief Update the checksum of a device grid
+/// @param d_gridData device pointer to GridData
+/// @param mode Mode of computation for the checksum.
+/// @param stream optional cuda stream (defaults to zero)
+/// @return The actual mode used for checksum computation. E.g. if @c d_gridData is NULL (or @c mode = CheckMode::Empty)
+///         then CheckMode::Empty is always returned. Else if the grid has no nodes or blind data CheckMode::Partial
+///         is always returned (even if @c mode = CheckMode::Full).
+inline void updateChecksum(GridData *d_gridData, CheckMode mode, cudaStream_t stream)
+{
+    NANOVDB_ASSERT(d_gridData);
+    if (mode == CheckMode::Empty) return;
+
+    // Allocate device LUT for CRC32 computation
+    auto d_lut = util::cuda::createCrc32Lut(0, stream);// unique pointer
+
+    // Update head checksum
+    crc32Head(d_gridData, d_lut.get(), (uint32_t*)d_gridData + 2, stream);
+
+    if (mode == CheckMode::Half) return;
+
+    // Copy just the GridData from the device to the host
+    std::unique_ptr buffer(new char[sizeof(GridData) + sizeof(TreeData)]);
+    auto *gridData = (GridData*)(buffer.get());
+    cudaCheck(cudaMemcpyAsync(gridData, d_gridData, sizeof(GridData) + sizeof(TreeData), cudaMemcpyDeviceToHost, stream));
+
+    // Update tail checksum
+    uint32_t *d_tail = (uint32_t*)d_gridData + 3;
+    if (gridData->mVersion > Version(32,6,0)) {
+        crc32Tail(d_gridData, gridData, d_lut.get(), d_tail, stream);
+    } else {
+        callNanoGrid(d_gridData, gridData, d_lut.get(), d_tail, stream);
+    }
+}// cudaUpdateChecksum
+
+/// @brief Update the checksum of a typed device grid
+/// @tparam ValueT Build type of the grid
+/// @param d_grid Device pointer to the grid
+/// @param mode Mode of computation for the checksum.
+/// @param stream optional cuda stream (defaults to zero)
+template
+void updateChecksum(NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0)
+{
+    NANOVDB_ASSERT(d_grid);
+    if (mode == CheckMode::Empty) return;
+
+    // Allocate device LUT for CRC32 computation
+    auto d_lut = util::cuda::createCrc32Lut(0, stream);// unique pointer
+
+    // Update head checksum
+    cuda::crc32Head(d_grid, d_lut.get(), (uint32_t*)d_grid + 2, stream);
+    if (mode == CheckMode::Half) return;
+
+    // Copy just the GridData from the device to the host
+    std::unique_ptr buffer(new char[sizeof(GridData) + sizeof(TreeData)]);
+    auto *gridData = (GridData*)(buffer.get());
+    cudaCheck(cudaMemcpyAsync(gridData, d_grid, sizeof(GridData) + sizeof(TreeData), cudaMemcpyDeviceToHost, stream));
+
+    // Update tail checksum
+    uint32_t *d_tail = (uint32_t*)d_grid + 3;
+    if (gridData->mVersion > Version(32,6,0)) {
+        crc32Tail(d_grid->data(), gridData, d_lut.get(), d_tail, stream);
+    } else {
+        crc32TailOld(d_grid, gridData, d_lut.get(), d_tail, stream);
+    }
+}
+
+}// namespace tools::cuda // ================================================
+
+}// namespace nanovdb // ====================================================
+
+#endif // 
NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/cuda/GridStats.cuh b/nanovdb/nanovdb/tools/cuda/GridStats.cuh
new file mode 100644
index 0000000000..34c615f6d2
--- /dev/null
+++ b/nanovdb/nanovdb/tools/cuda/GridStats.cuh
@@ -0,0 +1,249 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/cuda/GridStats.cuh
+
+    \author Ken Museth
+
+    \date October 9, 2023
+
+    \brief Re-computes min/max/avg/var/bbox information for each node in a
+           pre-existing NanoVDB grid on the device.
+*/
+
+#ifndef NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED
+#define NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED
+
+#include
+#include
+
+namespace nanovdb {
+
+namespace tools::cuda {
+
+/// @brief Update, i.e. re-compute, grid statistics (min/max, average/variance
+///        and bbox information) for an existing NanoVDB Grid.
+/// @param d_grid Device grid whose stats to update
+/// @param mode Mode of computation for the statistics.
+/// @param stream Optional cuda stream (defaults to zero)
+template
+void updateGridStats(NanoGrid *d_grid, StatsMode mode = StatsMode::Default, cudaStream_t stream = 0);
+
+//================================================================================================
+
+/// @brief Re-computes statistics (min/max/avg/var) and bbox information of an existing device NanoVDB grid
+template::ValueType>>
+class GridStats
+{
+    using GridT = NanoGrid;
+    using TreeT = typename GridT::TreeType;
+    using ValueT = typename TreeT::ValueType;
+    using Node0 = typename TreeT::Node0; // leaf
+    using Node1 = typename TreeT::Node1; // lower
+    using Node2 = typename TreeT::Node2; // upper
+    using RootT = typename TreeT::Node3; // root
+    static_assert(util::is_same::value, "Mismatching type");
+
+    ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta
+
+public:
+    GridStats(ValueT delta = ValueT(0)) : mDelta(delta) {}
+
+    void update(GridT *d_grid, cudaStream_t stream = 0);
+
+}; // cuda::GridStats
+
+//================================================================================================
+
+namespace {// define cuda kernels in an unnamed namespace
+
+template
+__global__ void processLeaf(NodeManager *d_nodeMgr, StatsT *d_stats)
+{
+    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_nodeMgr->leafCount()) return;
+    auto &d_leaf = d_nodeMgr->leaf(tid);
+
+    if (d_leaf.updateBBox()) {// updates active bounding box (also updates data->mFlags) and returns true if non-empty
+        if constexpr(StatsT::hasStats()) {
+            StatsT stats;
+            for (auto it = d_leaf.cbeginValueOn(); it; ++it) stats.add(*it);
+            if constexpr(StatsT::hasAverage()) {
+                d_stats[tid] = stats;
+                *reinterpret_cast(&d_leaf.mMinimum) = tid;
+            } else {
+                stats.setStats(d_leaf);
+            }
+        }
+    }
+    d_leaf.mFlags &= ~uint8_t(1u);// enable rendering
+}// processLeaf
+
+template
+__global__ void processInternal(NodeManager *d_nodeMgr, StatsT *d_stats)
+{
+    using ChildT = typename NanoNode::type;
+    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_nodeMgr->nodeCount(LEVEL)) return;
+    auto &d_node = d_nodeMgr->template node(tid);
+    auto &bbox = d_node.mBBox;
+    bbox = CoordBBox();// empty bbox
+    StatsT stats;
+    uint32_t childID = 0u;
+
+    for (auto it = d_node.beginChild(); it; ++it) {
+        auto &child = *it;
+        bbox.expand( child.bbox() );
+        if constexpr(StatsT::hasAverage()) {
+            childID = *reinterpret_cast(&child.mMinimum);
+            StatsT &s = d_stats[childID];
+            s.setStats(child);
+            stats.add(s);
+        } else if 
constexpr(StatsT::hasMinMax()) {
+            stats.add(child.minimum());
+            stats.add(child.maximum());
+        }
+    }
+    for (auto it = d_node.cbeginValueOn(); it; ++it) {
+        const Coord ijk = it.getCoord();
+        bbox[0].minComponent(ijk);
+        bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1));
+        if constexpr(StatsT::hasStats()) stats.add(*it, ChildT::NUM_VALUES);
+    }
+    if constexpr(StatsT::hasAverage()) {
+        d_stats[childID] = stats;
+        *reinterpret_cast(&d_node.mMinimum) = childID;
+    } else if constexpr(StatsT::hasMinMax()) {
+        stats.setStats(d_node);
+    }
+    d_node.mFlags &= ~uint64_t(1u);// enable rendering
+}// processInternal
+
+template
+__global__ void processRootAndGrid(NodeManager *d_nodeMgr, StatsT *d_stats)
+{
+    using ChildT = NanoUpper;
+    using ValueT = typename ChildT::ValueType;
+
+    // process root
+    auto &root = d_nodeMgr->root();
+    root.mBBox = CoordBBox();
+    if (root.isEmpty()) {
+        root.mMinimum = root.mMaximum = root.mBackground;
+        root.mAverage = root.mStdDevi = 0;
+    } else {
+        ValueT v;
+        StatsT s;
+        for (auto it = root.beginDense(); it; ++it) {
+            if (auto *child = it.probeChild(v)) {
+                root.mBBox.expand( child->bbox() );
+                if constexpr(StatsT::hasAverage()) {
+                    StatsT &stats = d_stats[*reinterpret_cast(&child->mMinimum)];
+                    stats.setStats(*child);
+                    s.add(stats);
+                } else if constexpr(StatsT::hasMinMax()){
+                    s.add(child->minimum());
+                    s.add(child->maximum());
+                }
+            } else if (it.isValueOn()) {
+                const Coord ijk = it.getCoord();
+                root.mBBox[0].minComponent(ijk);
+                root.mBBox[1].maxComponent(ijk + Coord(ChildT::DIM - 1));
+                if constexpr(StatsT::hasStats()) s.add(v, ChildT::NUM_VALUES);
+            }
+        }
+        s.setStats(root);
+    }
+
+    // process Grid
+    auto& grid = d_nodeMgr->grid();
+    const auto& indexBBox = root.bbox();
+    if (indexBBox.empty()) {
+        grid.mWorldBBox = Vec3dBBox();
+        grid.setBBoxOn(false);
+    } else {
+        // Note that below max is offset by one since CoordBBox.max is inclusive
+        // while bbox.max is exclusive. However, min is inclusive in both
+        // CoordBBox and Vec3dBBox. This also guarantees that a grid with a single
+        // active voxel does not have an empty world bbox! E.g. if a grid with a
+        // unit index-to-world transformation contains only the active voxel (0,0,0),
+        // then indexBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0)
+        // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions
+        // of index and world bounding boxes inherited from OpenVDB!
+ grid.mWorldBBox = CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(grid.map()); + grid.setBBoxOn(true); + } + + // set bit flags + grid.setMinMaxOn(StatsT::hasMinMax()); + grid.setAverageOn(StatsT::hasAverage()); + grid.setStdDeviationOn(StatsT::hasStdDeviation()); +}// processRootAndGrid + +}// cuda kernels are defined in an unnamed namespace + +//================================================================================================ + +template +void GridStats::update(NanoGrid *d_grid, cudaStream_t stream) +{ + static const uint32_t threadsPerBlock = 128; + auto blocksPerGrid = [&](uint32_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; + + auto nodeMgrHandle = nanovdb::cuda::createNodeManager(d_grid, CudaDeviceBuffer(), stream); + auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); + + uint32_t nodeCount[3];// {leaf, lower, upper} + cudaCheck(cudaMemcpyAsync(nodeCount, (char*)d_grid + sizeof(GridData) + 4*sizeof(uint64_t), 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + //cudaStreamSynchronize(stream);// finish all device tasks in stream + + StatsT *d_stats = nullptr; + + if constexpr(StatsT::hasAverage()) cudaCheck(util::cuda::mallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream)); + + processLeaf<<>>(d_nodeMgr, d_stats); + + processInternal<<>>(d_nodeMgr, d_stats); + + processInternal<<>>(d_nodeMgr, d_stats); + + processRootAndGrid<<<1, 1, 0, stream>>>(d_nodeMgr, d_stats); + + if constexpr(StatsT::hasAverage()) cudaCheck(util::cuda::freeAsync(d_stats, stream)); + +} // cuda::GridStats::update( Grid ) + +//================================================================================================ + +template +void updateGridStats(NanoGrid *d_grid, StatsMode mode, cudaStream_t stream) +{ + if (d_grid == nullptr && mode == StatsMode::Disable) { + return; + } else if (mode == StatsMode::BBox || util::is_same::value) { + GridStats > stats; + stats.update(d_grid, stream); + } else if (mode == StatsMode::MinMax) { + GridStats > stats; + stats.update(d_grid, stream); + } else if (mode == StatsMode::All) { + GridStats > stats; + stats.update(d_grid, stream); + } else { + throw std::runtime_error("GridStats: Unsupported statistics mode."); + } +}// cuda::updateGridStats + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::cuda::updateGridStats instead")]] +void cudaGridStats(NanoGrid *d_grid, tools::StatsMode mode = tools::StatsMode::Default, cudaStream_t stream = 0) +{ + tools::cuda::updateGridStats(d_grid, mode, stream); +} + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/tools/cuda/GridValidator.cuh b/nanovdb/nanovdb/tools/cuda/GridValidator.cuh new file mode 100644 index 0000000000..2edfc0bdb1 --- /dev/null +++ b/nanovdb/nanovdb/tools/cuda/GridValidator.cuh @@ -0,0 +1,59 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/tools/cuda/GridValidator.cuh + + \author Ken Museth + + \date November 3, 2023 + + \brief Checks the validity of an existing NanoVDB device grid. +*/ + +#ifndef NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include + +namespace nanovdb { + +namespace tools::cuda { + +/// @brief Return true if the specified grid passes several validation tests. 
+///
+/// @param d_grid Grid to validate
+/// @param mode If CheckMode::Full the validation test is detailed and relatively slow.
+/// @param verbose If true information about the first failed test is printed to std::cerr
+template
+bool isValid(const NanoGrid *d_grid, CheckMode mode, bool verbose = false, cudaStream_t stream = 0)
+{
+    static const int size = 100;
+    std::unique_ptr strUP(new char[size]);
+    util::cuda::unique_ptr d_strUP(size);
+    char *str = strUP.get(), *d_str = d_strUP.get();
+
+    util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {nanovdb::tools::checkGrid(d_grid, d_str, mode);});
+    cudaMemcpyAsync(str, d_str, size, cudaMemcpyDeviceToHost, stream);
+
+    if (util::empty(str) && !cuda::validateChecksum(d_grid, mode)) util::strcpy(str, "Mis-matching checksum");
+    if (verbose && !util::empty(str)) std::cerr << "Validation failed: " << str << std::endl;
+
+    return util::empty(str);
+}// tools::cuda::isValid
+
+}// namespace tools::cuda
+
+template
+[[deprecated("Use cuda::isValid() instead.")]]
+bool cudaIsValid(const NanoGrid *d_grid, CheckMode mode, bool verbose = false, cudaStream_t stream = 0)
+{
+    return tools::cuda::isValid(d_grid, mode, verbose, stream);
+}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh b/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh
new file mode 100644
index 0000000000..756a4fde63
--- /dev/null
+++ b/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh
@@ -0,0 +1,407 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/cuda/IndexToGrid.cuh
+
+    \author Ken Museth
+
+    \date April 17, 2023
+
+    \brief Combines an IndexGrid and values into a regular Grid on the device
+
+    \warning The header file contains cuda device code so be sure
+             to only include it in .cu files (or other .cuh files)
+*/
+
+#ifndef NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED
+#define NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+
+namespace nanovdb {// ================================================================
+
+namespace tools::cuda {// ============================================================
+
+/// @brief Freestanding function that combines an IndexGrid and values into a regular Grid
+/// @tparam DstBuildT Build type of the destination/output Grid
+/// @tparam SrcBuildT Build type of the source/input IndexGrid
+/// @tparam BufferT Type of the buffer used for allocation of the destination Grid
+/// @param d_srcGrid Device pointer to source/input IndexGrid, i.e. SrcBuildT={ValueIndex,ValueOnIndex,ValueIndexMask,ValueOnIndexMask}
+/// @param d_srcValues Device pointer to an array of values
+/// @param pool Memory pool used to create a buffer for the destination/output Grid
+/// @param stream optional CUDA stream (defaults to CUDA stream 0)
+/// @note If d_srcGrid has stats (min,max,avg,std-dev), the d_srcValues is also assumed
+///       to have the same information, all of which are then copied to the destination/output grid.
+///       An exception to this rule is if the type of d_srcValues is different from the stats type
+///       NanoRoot::FloatType, e.g. if DstBuildT=Vec3f then NanoRoot::FloatType=float,
+///       in which case average and standard-deviation are undefined in the output grid.
+/// @return returns handle to grid that combines the IndexGrid and values
+template
+typename util::enable_if::is_index, GridHandle>::type
+indexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0);
+
+
+template
+typename util::enable_if::is_index, GridHandle>::type
+createNanoGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0)
+{
+    return indexToGrid(d_srcGrid, d_srcValues, pool, stream);
+}
+
+namespace {// anonymous namespace
+
+template
+class IndexToGrid
+{
+    using SrcGridT = NanoGrid;
+public:
+    struct NodeAccessor;
+
+    /// @brief Constructor from a source IndexGrid
+    /// @param d_srcGrid Device pointer to the IndexGrid used as the source
+    IndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0);
+
+    ~IndexToGrid() {cudaCheck(util::cuda::freeAsync(mDevNodeAcc, mStream));}
+
+    /// @brief Toggle on and off verbose mode
+    /// @param on if true verbose is turned on
+    void setVerbose(bool on = true) {mVerbose = on;}
+
+    /// @brief Set the name of the destination/output grid
+    /// @param name Name used for the destination grid
+    void setGridName(const std::string &name) {mGridName = name;}
+
+    /// @brief Combines the IndexGrid with values to produce a regular Grid
+    /// @tparam DstBuildT Template parameter of the destination grid and value type
+    /// @tparam BufferT Template parameter of the memory allocator
+    /// @param srcValues pointer to values that will be inserted into the output grid
+    /// @param buffer optional buffer used for memory allocation
+    /// @return A new GridHandle with the grid of type @c DstBuildT
+    template
+    GridHandle getHandle(const typename BuildToValueMap::type *srcValues, const BufferT &buffer = BufferT());
+
+private:
+    cudaStream_t mStream{0};
+    util::cuda::Timer mTimer;
+    std::string mGridName;
+    bool mVerbose{false};
+    NodeAccessor mNodeAcc, *mDevNodeAcc;
+
+    template
+    BufferT getBuffer(const BufferT &pool);
+};// IndexToGrid
+
+//================================================================================================
+
+template
+struct IndexToGrid::NodeAccessor
+{
+    uint64_t grid, tree, root, node[3], meta, blind, size;// byte offsets, node: 0=leaf,1=lower, 2=upper
+    const SrcGridT *d_srcGrid;// device pointer to source IndexGrid
+    void *d_dstPtr;// device pointer to buffer with destination Grid
+    char *d_gridName;
+    uint32_t nodeCount[4];// 0=leaf, 1=lower, 2=upper, 3=root tiles
+
+    __device__ const NanoGrid& srcGrid() const {return *d_srcGrid;}
+    __device__ const NanoTree& srcTree() const {return d_srcGrid->tree();}
+    __device__ const NanoRoot& srcRoot() const {return d_srcGrid->tree().root();}
+    template
+    __device__ const typename NanoNode::type& srcNode(int i) const {
+        return *(this->srcTree().template getFirstNode() + i);
+    }
+
+    template
+    __device__ NanoGrid& dstGrid() const {return *util::PtrAdd>(d_dstPtr, grid);}
+    template
+    __device__ NanoTree& dstTree() const {return *util::PtrAdd>(d_dstPtr, tree);}
+    template
+    __device__ NanoRoot& dstRoot() const {return *util::PtrAdd>(d_dstPtr, root);}
+    template
+    __device__ typename NanoNode::type& dstNode(int i) const {
+        return *(util::PtrAdd::type>(d_dstPtr, node[LEVEL])+i);
+    }
+};// IndexToGrid::NodeAccessor
+
+//================================================================================================
+
+template
+__global__ void processGridTreeRootKernel(typename IndexToGrid::NodeAccessor *nodeAcc,
+                                          const 
typename BuildToValueMap::type *srcValues) +{ + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + + auto &srcGrid = nodeAcc->srcGrid(); + auto &dstGrid = nodeAcc->template dstGrid(); + auto &srcTree = srcGrid.tree(); + auto &dstTree = nodeAcc->template dstTree(); + auto &srcRoot = srcTree.root(); + auto &dstRoot = nodeAcc->template dstRoot(); + + // process Grid + *dstGrid.data() = *srcGrid.data(); + dstGrid.mGridType = toGridType(); + dstGrid.mData1 = 0u; + // we will recompute GridData::mChecksum later + + // process Tree + *dstTree.data() = *srcTree.data(); + dstTree.setRoot(&dstRoot); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + + // process Root + dstRoot.mBBox = srcRoot.mBBox; + dstRoot.mTableSize = srcRoot.mTableSize; + dstRoot.mBackground = srcValues[srcRoot.mBackground]; + if (srcGrid.hasMinMax()) { + dstRoot.mMinimum = srcValues[srcRoot.mMinimum]; + dstRoot.mMaximum = srcValues[srcRoot.mMaximum]; + } + if constexpr(util::is_same::value) {// e.g. {float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstRoot.mAverage = srcValues[srcRoot.mAverage]; + if (srcGrid.hasStdDeviation()) dstRoot.mStdDevi = srcValues[srcRoot.mStdDevi]; + } +}// processGridTreeRootKernel + +//================================================================================================ + +template +__global__ void processRootTilesKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + const auto tid = blockIdx.x; + + // Process children and tiles + const auto &srcTile = *nodeAcc->srcRoot().tile(tid); + auto &dstTile = *nodeAcc->template dstRoot().tile(tid); + dstTile.key = srcTile.key; + if (srcTile.child) { + dstTile.child = sizeof(NanoRoot) + sizeof(NanoRoot::Tile)*((srcTile.child - sizeof(NanoRoot))/sizeof(NanoRoot::Tile)); + dstTile.value = srcValues[0];// set to background + dstTile.state = false; + } else { + dstTile.child = 0;// i.e. no child node + dstTile.value = srcValues[srcTile.value]; + dstTile.state = srcTile.state; + } +}// processRootTilesKernel + +//================================================================================================ + +template +__global__ void processNodesKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + using SrcNodeT = typename NanoNode::type; + using DstNodeT = typename NanoNode::type; + using SrcChildT = typename SrcNodeT::ChildNodeType; + using DstChildT = typename DstNodeT::ChildNodeType; + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + + auto &srcNode = nodeAcc->template srcNode(blockIdx.x); + auto &dstNode = nodeAcc->template dstNode(blockIdx.x); + + if (threadIdx.x == 0 && threadIdx.y == 0) { + dstNode.mBBox = srcNode.mBBox; + dstNode.mFlags = srcNode.mFlags; + dstNode.mValueMask = srcNode.mValueMask; + dstNode.mChildMask = srcNode.mChildMask; + auto &srcGrid = nodeAcc->srcGrid(); + if (srcGrid.hasMinMax()) { + dstNode.mMinimum = srcValues[srcNode.mMinimum]; + dstNode.mMaximum = srcValues[srcNode.mMaximum]; + } + if constexpr(util::is_same::value) {// e.g. 
{float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstNode.mAverage = srcValues[srcNode.mAverage]; + if (srcGrid.hasStdDeviation()) dstNode.mStdDevi = srcValues[srcNode.mStdDevi]; + } + } + const uint64_t nodeSkip = nodeAcc->nodeCount[LEVEL] - blockIdx.x, srcOff = sizeof(SrcNodeT)*nodeSkip, dstOff = sizeof(DstNodeT)*nodeSkip;// offset to first node of child type + const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; + for (int threadIdx_z=0; threadIdx_z +__global__ void processLeafsKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + static_assert(!BuildTraits::is_special, "Invalid destination type!"); + auto &srcLeaf = nodeAcc->template srcNode<0>(blockIdx.x); + auto &dstLeaf = nodeAcc->template dstNode(blockIdx.x); + if (threadIdx.x == 0 && threadIdx.y == 0) { + dstLeaf.mBBoxMin = srcLeaf.mBBoxMin; + for (int i=0; i<3; ++i) dstLeaf.mBBoxDif[i] = srcLeaf.mBBoxDif[i]; + dstLeaf.mFlags = srcLeaf.mFlags; + dstLeaf.mValueMask = srcLeaf.mValueMask; + /// + auto &srcGrid = nodeAcc->srcGrid(); + if (srcGrid.hasMinMax()) { + dstLeaf.mMinimum = srcValues[srcLeaf.getMin()]; + dstLeaf.mMaximum = srcValues[srcLeaf.getMax()]; + } + if constexpr(util::is_same::value) {// e.g. {float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstLeaf.mAverage = srcValues[srcLeaf.getAvg()]; + if (srcGrid.hasStdDeviation()) dstLeaf.mStdDevi = srcValues[srcLeaf.getDev()]; + } + } + const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; + auto *dst = dstLeaf.mValues + off; + for (int threadIdx_z=0; threadIdx_z +__global__ void cpyNodeCountKernel(const NanoGrid *srcGrid, + typename IndexToGrid::NodeAccessor *nodeAcc) +{ + assert(srcGrid->isSequential()); + nodeAcc->d_srcGrid = srcGrid; + for (int i=0; i<3; ++i) nodeAcc->nodeCount[i] = srcGrid->tree().nodeCount(i); + nodeAcc->nodeCount[3] = srcGrid->tree().root().tileCount(); +} + +}// anonymous namespace + +//================================================================================================ + +template +IndexToGrid::IndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream) + : mStream(stream), mTimer(stream) +{ + NANOVDB_ASSERT(d_srcGrid); + cudaCheck(util::cuda::mallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream)); + cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc +} + +//================================================================================================ + +template +template +GridHandle IndexToGrid::getHandle(const typename BuildToValueMap::type *srcValues, + const BufferT &pool) +{ + if (mVerbose) mTimer.start("Initiate buffer"); + auto buffer = this->template getBuffer(pool); + + if (mVerbose) mTimer.restart("Process grid,tree,root"); + processGridTreeRootKernel<<<1, 1, 0, mStream>>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process root children and tiles"); + processRootTilesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + cudaCheck(util::cuda::freeAsync(mNodeAcc.d_gridName, mStream)); + + if (mVerbose) mTimer.restart("Process upper internal nodes"); + processNodesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process lower internal nodes"); + 
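// The lower internal nodes are converted by the same processNodesKernel as the
// upper nodes above, just instantiated for the next tree level down.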
processNodesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process leaf nodes"); + processLeafsKernel<<>>(mDevNodeAcc, srcValues); + if (mVerbose) mTimer.stop(); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Compute checksums"); + updateChecksum((GridData*)mNodeAcc.d_dstPtr, mStream); + if (mVerbose) mTimer.stop(); + + //cudaStreamSynchronize(mStream);// finish all device tasks in mStream + return GridHandle(std::move(buffer)); +}// IndexToGrid::getHandle + +//================================================================================================ + +template +template +inline BufferT IndexToGrid::getBuffer(const BufferT &pool) +{ + mNodeAcc.grid = 0;// grid is always stored at the start of the buffer! + mNodeAcc.tree = NanoGrid::memUsage(); // grid ends and tree begins + mNodeAcc.root = mNodeAcc.tree + NanoTree::memUsage(); // tree ends and root node begins + mNodeAcc.node[2] = mNodeAcc.root + NanoRoot::memUsage(mNodeAcc.nodeCount[3]); // root node ends and upper internal nodes begin + mNodeAcc.node[1] = mNodeAcc.node[2] + NanoUpper::memUsage()*mNodeAcc.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin + mNodeAcc.node[0] = mNodeAcc.node[1] + NanoLower::memUsage()*mNodeAcc.nodeCount[1]; // lower internal nodes ends and leaf nodes begin + mNodeAcc.meta = mNodeAcc.node[0] + NanoLeaf::DataType::memUsage()*mNodeAcc.nodeCount[0];// leaf nodes end and blind meta data begins + mNodeAcc.blind = mNodeAcc.meta + 0*sizeof(GridBlindMetaData); // meta data ends and blind data begins + mNodeAcc.size = mNodeAcc.blind;// end of buffer + auto buffer = BufferT::create(mNodeAcc.size, &pool, false, mStream); + mNodeAcc.d_dstPtr = buffer.deviceData(); + if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device"); + + if (size_t size = mGridName.size()) { + cudaCheck(util::cuda::mallocAsync((void**)&mNodeAcc.d_gridName, size, mStream)); + cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream)); + } else { + mNodeAcc.d_gridName = nullptr; + } + cudaCheck(cudaMemcpyAsync(mDevNodeAcc, &mNodeAcc, sizeof(NodeAccessor), cudaMemcpyHostToDevice, mStream));// copy NodeAccessor CPU -> GPU + return buffer; +} + +//================================================================================================ + +template +typename util::enable_if::is_index, GridHandle>::type +indexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool, cudaStream_t stream) +{ + IndexToGrid converter(d_srcGrid, stream); + return converter.template getHandle(d_srcValues, pool); +} + +}// namespace tools::cuda ============================================================= + +template +[[deprecated("Use nanovdb::cuda::indexToGrid instead")]] +typename util::enable_if::is_index, GridHandle>::type +cudaIndexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) +{ + return tools::cuda::indexToGrid(d_srcGrid, d_srcValues, pool, stream); +} + + +template +[[deprecated("Use nanovdb::cuda::indexToGrid instead")]] +typename util::enable_if::is_index, GridHandle>::type +cudaCreateNanoGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) +{ + return tools::cuda::indexToGrid(d_srcGrid, d_srcValues, pool, stream); +} + +}// nanovdb namespace 
===================================================================
+
+#endif // NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
new file mode 100644
index 0000000000..70134b5e7b
--- /dev/null
+++ b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
@@ -0,0 +1,1293 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/tools/cuda/PointsToGrid.cuh
+
+    \authors Greg Klar (initial version) and Ken Museth (final version)
+
+    \brief Generates NanoVDB grids from a list of voxels or points on the device
+
+    \warning The header file contains cuda device code so be sure
+             to only include it in .cu files (or other .cuh files)
+*/
+
+#ifndef NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED
+#define NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+    Note: 4.29 billion (=2^32) coordinates of type Vec3f have a memory footprint of 48 GB!
+*/
+
+namespace nanovdb {// ================================================================================
+
+namespace tools::cuda {// ============================================================================
+
+/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is
+///        mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering.
+/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d.
+/// @tparam BufferT Template type of buffer used for memory allocation on the device
+/// @tparam AllocT Template type of optional device allocator for internal temporary memory
+/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device
+/// @param pointCount number of points in the list @c dWorldPoints
+/// @param voxelSize Size of a voxel in world units used for the output grid
+/// @param type Defines the way point information is represented in the output grid (see PointType enum in NanoVDB.h)
+///             Should not be PointType::Disable!
+/// @param buffer Instance of the device buffer used for memory allocation
+/// @param stream optional CUDA stream (defaults to CUDA stream 0)
+/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates,
+///         are represented as blind data defined by @c type.
+template
+GridHandle
+pointsToGrid(const PtrT dWorldPoints,
+             int pointCount,
+             double voxelSize,
+             PointType type = PointType::Default,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is
+///        mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering.
+/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d.
+/// @tparam BufferT Template type of buffer used for memory allocation on the device
+/// @tparam AllocT Template type of optional device allocator for internal temporary memory
+/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device
+/// @param pointCount total number of points in the list @c dWorldPoints
+/// @param maxPointsPerVoxel Target maximum density of points per voxel, i.e. maximum number of points in any voxel
+/// @param tolerance allow for the point density to vary by the specified tolerance (defaults to 1). That is, the voxel size
+///        is selected such that the maximum density is within +/- the tolerance of the target.
+/// @param maxIterations Maximum number of iterations used to search for a voxel size that produces a point density
+///        within the specified tolerance.
+/// @param type Defines the way point information is represented in the output grid (see PointType enum in NanoVDB.h)
+///             Should not be PointType::Disable!
+/// @param buffer Instance of the device buffer used for memory allocation
+/// @param stream optional CUDA stream (defaults to CUDA stream 0)
+/// @return Returns a handle with a grid of type NanoGrid<Point> where point information, e.g. coordinates,
+///         are represented as blind data defined by @c type.
+template<typename PtrT, typename BufferT = nanovdb::cuda::DeviceBuffer, typename AllocT = cub::CachingDeviceAllocator>
+GridHandle<BufferT>
+pointsToGrid(const PtrT dWorldPoints,
+             int pointCount,
+             int maxPointsPerVoxel,
+             int tolerance = 1,
+             int maxIterations = 10,
+             PointType type = PointType::Default,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+template<typename PtrT, typename BufferT = nanovdb::cuda::DeviceBuffer>
+GridHandle<BufferT>
+pointsToGrid(std::vector<std::tuple<const PtrT, size_t, double, PointType>> pointSet,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+/// @brief Generates a NanoGrid of any type from a list of voxel coordinates on the device. Unlike @c pointsToGrid
+///        this method only builds the grid but does not encode the coordinates as blind data. It is mainly useful as a
+///        means to generate a grid that is known to contain the voxels given in the list.
+/// @tparam BuildT Template type of the return grid
+/// @tparam PtrT Template type to a raw or fancy-pointer of voxel coordinates in grid (i.e. index) space. Dereferencing should return Coord, Vec3f or Vec3d.
+/// @tparam BufferT Template type of buffer used for memory allocation on the device
+/// @tparam AllocT Template type of optional device allocator for internal temporary memory
+/// @param dGridVoxels Raw or fancy pointer to list of voxel coordinates in grid (or index) space on the device
+/// @param voxelCount number of voxels in the list @c dGridVoxels
+/// @param voxelSize Size of a voxel in world units used for the output grid
+/// @param buffer Instance of the device buffer used for memory allocation
+/// @return Returns a handle with the grid of type NanoGrid<BuildT>
+template<typename BuildT, typename PtrT, typename BufferT = nanovdb::cuda::DeviceBuffer, typename AllocT = cub::CachingDeviceAllocator>
+GridHandle<BufferT>
+voxelsToGrid(const PtrT dGridVoxels,
+             size_t voxelCount,
+             double voxelSize = 1.0,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-------------------------------------------------------------------------------------------------------
+
+template<typename BuildT, typename PtrT, typename BufferT = nanovdb::cuda::DeviceBuffer>
+GridHandle<BufferT>
+voxelsToGrid(std::vector<std::tuple<const PtrT, size_t, double>> pointSet,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+}// namespace tools::cuda ========================================================================
+
+/// @brief Example class of a fancy pointer that can optionally be used as a template for writing
+///        a custom fancy pointer that allows for particle coordinates to be arranged non-linearly
+///        in memory. For instance, when coordinates are interleaved with other data, i.e. in an array
+///        of structs, a custom implementation of fancy_ptr::operator[](size_t i) can account for
+///        strides that skip the interleaved data.
+/// @tparam T Template type that specifies the type used for the coordinates of the points
+template <typename T>
+class fancy_ptr
+{
+    const T* mPtr;
+public:
+    /// @brief Default constructor.
+    /// @note  This method is actually not required by cuda::PointsToGrid
+    /// @param ptr Pointer to array of elements
+    __hostdev__ explicit fancy_ptr(const T* ptr = nullptr) : mPtr(ptr) {}
+    /// @brief Index access into the array pointed to by the stored pointer.
+    /// @note  This method is required by cuda::PointsToGrid!
+    /// @param i Unsigned index of the element to be returned
+    /// @return Const reference to the element at the i'th position
+    __hostdev__ inline const T& operator[](size_t i) const {return mPtr[i];}
+    /// @brief Dummy implementation required by pointer_traits.
+    /// @note  Note that only the return type matters!
+    /// @details Unlike operator[] it is safe to assume that all pointer types have operator*,
+    ///          which is why pointer_traits makes use of it to determine the element_type that
+    ///          a pointer class is pointing to. E.g. operator[] is not always defined for std::shared_ptr!
+    __hostdev__ inline const T& operator*() const {return *mPtr;}
+};// fancy_ptr
+
+/// @brief Simple stand-alone function that can be used to conveniently construct a fancy_ptr
+/// @tparam T Template type that specifies the type used for the coordinates of the points
+/// @param ptr Raw pointer to data
+/// @return a new instance of a fancy_ptr
+template <typename T>
+fancy_ptr<T> make_fancy(const T* ptr = nullptr) {return fancy_ptr<T>(ptr);}
+
+//-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+/// @brief Traits of a raw or fancy pointer to points, i.e. the element type pointed to and the size in bytes of that type
+template <typename T>
+struct pointer_traits;
+
+template <typename T>
+struct pointer_traits<T*> {
+    using element_type = T;
+    static constexpr size_t element_size = sizeof(T);
+};
+
+template <typename T>
+struct pointer_traits {
+    using element_type = typename util::remove_reference<decltype(*util::declval<T>())>::type;// assumes T::operator*() exists!
+    static constexpr size_t element_size = sizeof(element_type);
+};
+
+//-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+/// @brief computes the relative 8-bit voxel offsets from a world coordinate
+/// @tparam Vec3T Type of the world coordinate
+/// @param voxel 8-bit output coordinates that are relative to a voxel
+/// @param world input world coordinates
+/// @param indexToWorld Transform from index to world space
+template <typename Vec3T>
+__hostdev__ inline static void worldToVoxel(Vec3u8 &voxel, const Vec3T &world, const Map &indexToWorld)
+{
+    const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index
+    static constexpr double encode = double((1<<8) - 1);
+    voxel[0] = uint8_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) );
+    voxel[1] = uint8_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) );
+    voxel[2] = uint8_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) );
+}
+
+/// @brief computes the relative 16-bit voxel offsets from a world coordinate
+/// @tparam Vec3T Type of the world coordinate
+/// @param voxel 16-bit output coordinates that are relative to a voxel
+/// @param world input world coordinates
+/// @param indexToWorld Transform from index to world space
+template <typename Vec3T>
+__hostdev__ inline static void worldToVoxel(Vec3u16 &voxel, const Vec3T &world, const Map &indexToWorld)
+{
+    const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index
+    static constexpr double encode = double((1<<16) - 1);
+    voxel[0] = uint16_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) );
+    voxel[1] = uint16_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) );
+    voxel[2] = uint16_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) );
+}
+
+/// @brief computes the relative float voxel offsets from a world coordinate
+/// @tparam Vec3T Type of the world coordinate
+/// @param voxel float output coordinates that are relative to a voxel
+/// @param world input world coordinates
+/// @param indexToWorld Transform from index to world space
+template <typename Vec3T>
+__hostdev__ inline static void worldToVoxel(Vec3f &voxel, const Vec3T &world, const Map &indexToWorld)
+{
+    const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index
+    voxel[0] = float( ijk[0] - math::Floor(ijk[0] + 0.5) );
+    voxel[1] = float( ijk[1] - math::Floor(ijk[1] + 0.5) );
+    voxel[2] = float( ijk[2] - math::Floor(ijk[2] + 0.5) );
+}
+
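+// A usage sketch (not part of this header's API): round-tripping a world point through
+// the 8-bit encoding above and the matching voxelToWorld decoder defined below. The
+// encoding truncates the fractional in-voxel position to 255 steps per axis, so the
+// world->voxel->world reconstruction error is bounded by voxelSize/255 per axis.
+// All values below are hypothetical:
+//
+//     nanovdb::Map map(0.1);                      // index-to-world transform, voxel size 0.1
+//     nanovdb::Vec3f world(0.234f, -1.567f, 3.1f);// arbitrary world-space point
+//     nanovdb::Vec3u8 voxel;
+//     worldToVoxel(voxel, world, map);            // quantize to 8 bits per axis
+//     const nanovdb::Coord ijk = map.applyInverseMapF(world).round();
+//     const auto back = voxelToWorld<nanovdb::Vec3f>(voxel, ijk, map);
+//     // each |back[i] - world[i]| < 0.1/255, i.e. ~4e-4 world units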
+//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u8 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<8) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u16 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<16) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3f &voxel, const Coord &ijk, const Map &map) +{ + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } else { + return map.applyMapF(Vec3f(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +namespace tools::cuda { + +template +class PointsToGrid +{ +public: + + struct Data { + Map map; + void *d_bufferPtr; + uint64_t *d_keys, *d_tile_keys, *d_lower_keys, *d_leaf_keys;// device pointer to 64 bit keys + uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size;// byte offsets to nodes in buffer + uint32_t *d_indx;// device pointer to point indices (or IDs) + uint32_t nodeCount[3], *pointsPerLeafPrefix, *pointsPerLeaf;// 0=leaf,1=lower, 2=upper + uint32_t voxelCount, *pointsPerVoxelPrefix, *pointsPerVoxel; + BitFlags<16> flags; + __hostdev__ NanoGrid& getGrid() const {return *util::PtrAdd>(d_bufferPtr, grid);} + __hostdev__ NanoTree& getTree() const {return *util::PtrAdd>(d_bufferPtr, tree);} + __hostdev__ NanoRoot& getRoot() const {return *util::PtrAdd>(d_bufferPtr, root);} + __hostdev__ NanoUpper& getUpper(int i) const {return *(util::PtrAdd>(d_bufferPtr, upper)+i);} + __hostdev__ NanoLower& getLower(int i) const {return *(util::PtrAdd>(d_bufferPtr, lower)+i);} + __hostdev__ NanoLeaf& getLeaf(int i) const {return *(util::PtrAdd>(d_bufferPtr, leaf)+i);} + __hostdev__ GridBlindMetaData& getMeta() const { return *util::PtrAdd(d_bufferPtr, meta);}; + template + __hostdev__ Vec3T& getPoint(int i) const {return *(util::PtrAdd(d_bufferPtr, blind)+i);} + };// Data + + /// @brief Map constructor, which other constructors might call + /// @param map Map to be used for the output device grid + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(const Map &map, cudaStream_t stream = 0) + : mStream(stream) + , mPointType(util::is_same::value ? 
PointType::Default : PointType::Disable)
+    {
+        mData.map = map;
+        mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
+        mDeviceData = mMemPool.template alloc<Data>(mStream);
+    }
+
+    /// @brief Default constructor that calls the Map constructor defined above
+    /// @param scale Voxel size in world units
+    /// @param trans Translation of origin in world units
+    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
+    PointsToGrid(const double scale = 1.0, const Vec3d &trans = Vec3d(0.0), cudaStream_t stream = 0)
+        : PointsToGrid(Map(scale, trans), stream){}
+
+    /// @brief Constructor from a target maximum number of particles per voxel. Calls the Map constructor defined above
+    /// @param maxPointsPerVoxel Maximum number of points per voxel
+    /// @param tolerance Allowed deviation from @c maxPointsPerVoxel (defaults to 1)
+    /// @param maxIterations Maximum number of iterations used to search for a matching voxel size (defaults to 10)
+    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
+    PointsToGrid(int maxPointsPerVoxel, int tolerance = 1, int maxIterations = 10, cudaStream_t stream = 0)
+        : PointsToGrid(Map(1.0), stream)
+    {
+        mMaxPointsPerVoxel = maxPointsPerVoxel;
+        mTolerance = tolerance;
+        mMaxIterations = maxIterations;
+    }
+
+    /// @brief Toggle on and off verbose mode
+    /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking
+    void setVerbose(int level = 1) {mVerbose = level; mData.flags.setBit(7u, level); }
+
+    /// @brief Set the mode for checksum computation, which is disabled by default
+    /// @param mode Mode of checksum computation
+    void setChecksum(CheckMode mode = CheckMode::Disable){mChecksum = mode;}
+
+    /// @brief Toggle on and off the computation of a bounding-box
+    /// @param on If true bbox will be computed
+    void includeBBox(bool on = true) { mData.flags.setMask(GridFlags::HasBBox, on); }
+
+    /// @brief Set the name of the output grid
+    /// @param name name of the output grid
+    void setGridName(const std::string &name) {mGridName = name;}
+
+    // only available when BuildT == Point
+    template <typename T = BuildT> typename util::enable_if<util::is_same<T, Point>::value>::type
+    setPointType(PointType type) { mPointType = type; }
+
+    /// @brief Creates a handle to a grid with the specified build type from a list of points in index or world space
+    /// @tparam BuildT Build type of the output grid, i.e. NanoGrid<BuildT>
+    /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world or index space.
+    /// @tparam BufferT Buffer type used for allocation of the grid handle
+    /// @param points device pointer to an array of points in world space
+    /// @param pointCount number of input points or voxels
+    /// @param buffer optional buffer (currently ignored)
+    /// @return returns a handle with a grid of type NanoGrid<BuildT>
+    template<typename PtrT, typename BufferT = nanovdb::cuda::DeviceBuffer>
+    GridHandle<BufferT> getHandle(const PtrT points,
+                                  size_t pointCount,
+                                  const BufferT &buffer = BufferT());
+
+    template <typename PtrT>
+    void countNodes(const PtrT points, size_t pointCount);
+
+    template <typename PtrT>
+    void processGridTreeRoot(const PtrT points, size_t pointCount);
+
+    void processUpperNodes();
+
+    void processLowerNodes();
+
+    template <typename PtrT>
+    void processLeafNodes(const PtrT points);
+
+    template <typename PtrT>
+    void processPoints(const PtrT points, size_t pointCount);
+
+    void processBBox();
+
+    // the following methods are only defined when BuildT == Point
+    template <typename T = BuildT> typename util::enable_if<util::is_same<T, Point>::value, uint32_t>::type
+    maxPointsPerVoxel() const {return mMaxPointsPerVoxel;}
+    template <typename T = BuildT> typename util::enable_if<util::is_same<T, Point>::value, uint32_t>::type
+    maxPointsPerLeaf() const {return mMaxPointsPerLeaf;}
+
+private:
+    static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256!
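+    // Launch sizing used throughout: kernels run as numBlocks(n) blocks of mNumThreads
+    // threads each, i.e. ceil(n/128) blocks with one thread per work item.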
+ static unsigned int numBlocks(unsigned int n) {return (n + mNumThreads - 1) / mNumThreads;} + + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + PointType mPointType; + std::string mGridName; + int mVerbose{0}; + Data mData, *mDeviceData; + uint32_t mMaxPointsPerVoxel{0u}, mMaxPointsPerLeaf{0u}; + int mTolerance{1}, mMaxIterations{1}; + CheckMode mChecksum{CheckMode::Disable}; + + // wrapper of AllocT, defaulting to cub::CachingDeviceAllocator, which offers a shared scratch space + struct Allocator { + AllocT mAllocator; + void* d_scratch; + size_t scratchSize, actualScratchSize; + Allocator() : d_scratch(nullptr), scratchSize(0), actualScratchSize(0) {} + ~Allocator() { + if (scratchSize > 0) this->free(d_scratch);// a bug in cub makes this necessary + mAllocator.FreeAllCached(); + } + template + T* alloc(size_t count, cudaStream_t stream) { + T* d_ptr = nullptr; + cudaCheck(mAllocator.DeviceAllocate((void**)&d_ptr, sizeof(T)*count, stream)); + return d_ptr; + } + template + T* alloc(cudaStream_t stream) {return this->template alloc(1, stream);} + void free(void *d_ptr) {if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr));} + template + void free(void *d_ptr, T... other) { + if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr)); + this->free(other...); + } + void adjustScratch(cudaStream_t stream){ + if (scratchSize > actualScratchSize) { + if (actualScratchSize>0) cudaCheck(mAllocator.DeviceFree(d_scratch)); + cudaCheck(mAllocator.DeviceAllocate((void**)&d_scratch, scratchSize, stream)); + actualScratchSize = scratchSize; + } + } + } mMemPool; + + template + BufferT getBuffer(const PtrT points, size_t pointCount, const BufferT &buffer); +};// tools::cuda::PointsToGrid + +namespace kernels { +/// @details Used by cuda::PointsToGrid::processLeafNodes before the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. +/// function in a lambda through lambdaKernel wrapper defined in CudaUtils.h. +template +__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + devValueIndex[tid] = static_cast(d_data->getLeaf(tid).mValueMask.countOn()); +} + +/// @details Used by PointsToGrid::processLeafNodes for the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. 
+template <typename BuildT>
+__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid<BuildT>::Data* d_data) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+
+    auto &leaf = d_data->getLeaf(tid);
+    leaf.mOffset = 1u;// will be re-set below
+    const uint64_t *w = leaf.mValueMask.words();
+    uint64_t &prefixSum = leaf.mPrefixSum, sum = util::countOn(*w++);
+    prefixSum = sum;
+    for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6
+        sum += util::countOn(*w++);
+        prefixSum |= sum << n;// each prefix sum is encoded in 9 bits
+    }
+    if (tid==0) {
+        d_data->getGrid().mData1 = 1u + devValueIndexPrefix[d_data->nodeCount[0]-1];// set total count
+        d_data->getTree().mVoxelCount = devValueIndexPrefix[d_data->nodeCount[0]-1];
+    } else {
+        leaf.mOffset = 1u + devValueIndexPrefix[tid-1];// background is index 0
+    }
+}
+
+/// @details Used by PointsToGrid<BuildT>::processLeafNodes to make sure leaf.mMask = leaf.mValueMask.
+///          Moving this away from an implementation using the lambdaKernel wrapper
+///          to fix the following error on the Windows platform:
+///          error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if'
+///          or 'else' block of a constexpr if statement.
+template <typename BuildT>
+__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid<BuildT>::Data* d_data) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+    auto &leaf = d_data->getLeaf(tid);
+    leaf.mMask = leaf.mValueMask;
+}
+} // namespace kernels
+
+//-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+// Define utility macro used to call cub functions that use dynamic temporary storage
+#ifndef CALL_CUBS
+#ifdef _WIN32
+#define CALL_CUBS(func, ...) \
+    cudaCheck(cub::func(nullptr, mMemPool.scratchSize, __VA_ARGS__, mStream)); \
+    mMemPool.adjustScratch(mStream); \
+    cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, __VA_ARGS__, mStream));
+#else// ifdef _WIN32
+#define CALL_CUBS(func, args...)
\ + cudaCheck(cub::func(nullptr, mMemPool.scratchSize, args, mStream)); \ + mMemPool.adjustScratch(mStream); \ + cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, args, mStream)); +#endif// ifdef _WIN32 +#endif// ifndef CALL_CUBS + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline GridHandle +PointsToGrid::getHandle(const PtrT points, + size_t pointCount, + const BufferT &pool) +{ + if (mVerbose==1) mTimer.start("\nCounting nodes"); + this->countNodes(points, pointCount); + + if (mVerbose==1) mTimer.restart("Initiate buffer"); + auto buffer = this->getBuffer(points, pointCount, pool); + + if (mVerbose==1) mTimer.restart("Process grid,tree,root"); + this->processGridTreeRoot(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process upper nodes"); + this->processUpperNodes(); + + if (mVerbose==1) mTimer.restart("Process lower nodes"); + this->processLowerNodes(); + + if (mVerbose==1) mTimer.restart("Process leaf nodes"); + this->processLeafNodes(points); + + if (mVerbose==1) mTimer.restart("Process points"); + this->processPoints(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process bbox"); + this->processBBox(); + if (mVerbose==1) mTimer.stop(); + + if (mVerbose==1) mTimer.restart("Computation of checksum"); + tools::cuda::updateChecksum((GridData*)buffer.deviceData(), mChecksum); + if (mVerbose==1) mTimer.stop(); + + return GridHandle(std::move(buffer)); +}// PointsToGrid::getHandle + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// --- CUB helpers --- +template +struct ShiftRight +{ + __hostdev__ inline OutT operator()(const InT& v) const {return static_cast(v >> BitCount);} +}; + +template +struct ShiftRightIterator : public cub::TransformInputIterator, InT*> +{ + using BASE = cub::TransformInputIterator, InT*>; + __hostdev__ inline ShiftRightIterator(uint64_t* input_itr) : BASE(input_itr, ShiftRight()) {} +}; + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +void PointsToGrid::countNodes(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + if constexpr(util::is_same::value) { + static_assert(util::is_same::value, "Point (vs voxels) coordinates should be represented as Vec3f or Vec3d"); + } else { + static_assert(util::is_same::value, "Voxel coordinates should be represented as Coord, Vec3f or Vec3d"); + } + + mMaxPointsPerVoxel = math::Min(mMaxPointsPerVoxel, pointCount); + int iterCounter = 0; + struct Foo {// pairs current voxel size, dx, with the corresponding particle density, i.e. maximum number of points per voxel + double dx; + uint32_t density; + bool operator<(const Foo &rhs) const {return density < rhs.density || (density == rhs.density && dx < rhs.dx);} + } min{0.0, 1}, max{0.0, 0};// min: as dx -> 0 density -> 1 point per voxel, max: density is 0 i.e. 
undefined + +jump:// this marks the beginning of the actual algorithm + + mData.d_keys = mMemPool.template alloc(pointCount, mStream); + mData.d_indx = mMemPool.template alloc(pointCount, mStream);// uint32_t can index 4.29 billion Coords, corresponding to 48 GB + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy mData from CPU -> GPU + + if (mVerbose==2) mTimer.start("\nAllocating arrays for keys and indices"); + auto *d_keys = mMemPool.template alloc(pointCount, mStream); + auto *d_indx = mMemPool.template alloc(pointCount, mStream); + + if (mVerbose==2) mTimer.restart("Generate tile keys"); + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, const Data *d_data, const PtrT points) { + auto coordToKey = [](const Coord &ijk)->uint64_t{ + // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 + static constexpr int64_t offset = 1 << 31; + return (uint64_t(uint32_t(int64_t(ijk[2]) + offset) >> 12) ) | // z is the lower 21 bits + (uint64_t(uint32_t(int64_t(ijk[1]) + offset) >> 12) << 21) | // y is the middle 21 bits + (uint64_t(uint32_t(int64_t(ijk[0]) + offset) >> 12) << 42); // x is the upper 21 bits + };// coordToKey lambda functor + d_indx[tid] = uint32_t(tid); + uint64_t &key = d_keys[tid]; + if constexpr(util::is_same::value) {// points are in world space + if constexpr(util::is_same::value) { + key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); + } else {// points are Vec3d + key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); + } + } else if constexpr(util::is_same::value) {// points Coord are in index space + key = coordToKey(points[tid]); + } else {// points are Vec3f or Vec3d in index space + key = coordToKey(points[tid].round()); + } + }, mDeviceData, points); + cudaCheckError(); + if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 62);// 21 bits per coord + std::swap(d_indx, mData.d_indx);// sorted indices are now in d_indx + + if (mVerbose==2) mTimer.restart("Allocate runs"); + auto *d_points_per_tile = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_node_count = mMemPool.template alloc(3, mStream); + + if (mVerbose==2) mTimer.restart("DeviceRunLengthEncode tile keys"); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, d_points_per_tile, d_node_count+2, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount+2, d_node_count+2, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); + uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; + cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_points_per_tile); + + for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { + const uint32_t count = points_per_tile[id]; + util::cuda::lambdaKernel<<>>(count, [=] __device__(size_t tid, const Data *d_data) { + auto voxelKey = [] __device__ (uint64_t tileID, const Coord &ijk){ + return tileID << 36 | // upper 
offset: 64-15-12-9=28, i.e. last 28 bits + uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits + uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits + uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits + };// voxelKey lambda functor + tid += offset; + Vec3T p = points[d_indx[tid]]; + if constexpr(util::is_same::value) p = util::is_same::value ? d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); + d_keys[tid] = voxelKey(id, p.round()); + }, mDeviceData); cudaCheckError(); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36 + offset += count; + } + mMemPool.free(d_indx); + delete [] points_per_tile; + + if (mVerbose==2) mTimer.restart("Count points per voxel"); + + mData.pointsPerVoxel = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_voxel_count = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, mData.pointsPerVoxel, d_voxel_count, pointCount); + cudaCheck(cudaMemcpyAsync(&mData.voxelCount, d_voxel_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_voxel_count); + + if (util::is_same::value) { + if (mVerbose==2) mTimer.restart("Count max points per voxel"); + uint32_t *d_maxPointsPerVoxel = mMemPool.template alloc(mStream), maxPointsPerVoxel; + CALL_CUBS(DeviceReduce::Max, mData.pointsPerVoxel, d_maxPointsPerVoxel, mData.voxelCount); + cudaCheck(cudaMemcpyAsync(&maxPointsPerVoxel, d_maxPointsPerVoxel, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_maxPointsPerVoxel); + double dx = mData.map.getVoxelSize()[0]; + if (++iterCounter >= mMaxIterations || pointCount == 1u || math::Abs((int)maxPointsPerVoxel - (int)mMaxPointsPerVoxel) <= mTolerance) { + mMaxPointsPerVoxel = maxPointsPerVoxel; + } else { + const Foo tmp{dx, maxPointsPerVoxel}; + if (maxPointsPerVoxel < mMaxPointsPerVoxel) { + if (min < tmp) min = tmp; + } else if (max.density == 0 || tmp < max) { + max = tmp; + } + if (max.density) { + dx = (min.dx*(max.density - mMaxPointsPerVoxel) + max.dx*(mMaxPointsPerVoxel-min.density))/double(max.density-min.density); + } else if (maxPointsPerVoxel > 1u) { + dx *= (mMaxPointsPerVoxel-1.0)/(maxPointsPerVoxel-1.0); + } else {// maxPointsPerVoxel = 1 so increase dx significantly + dx *= 10.0; + } + if (mVerbose==2) printf("\ntarget density = %u, current density = %u current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx); + mData.map = Map(dx); + mMemPool.free(mData.d_keys, mData.d_indx, d_keys, mData.d_tile_keys, d_node_count, mData.pointsPerVoxel); + goto jump; + } + } + if (iterCounter>1 && mVerbose) std::cerr << "Used " << iterCounter << " attempts to determine dx that produces a target dpoint denisty\n\n"; + + if (mVerbose==2) mTimer.restart("Compute prefix sum of points per voxel"); + mData.pointsPerVoxelPrefix = mMemPool.template alloc(mData.voxelCount, mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.voxelCount); + + mData.pointsPerLeaf = mMemPool.template alloc(pointCount, mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, ShiftRightIterator<9>(mData.d_keys), d_keys, mData.pointsPerLeaf, d_node_count, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount, d_node_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + + if 
constexpr(util::is_same::value) { + uint32_t *d_maxPointsPerLeaf = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceReduce::Max, mData.pointsPerLeaf, d_maxPointsPerLeaf, mData.nodeCount[0]); + cudaCheck(cudaMemcpyAsync(&mMaxPointsPerLeaf, d_maxPointsPerLeaf, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + //printf("\n Leaf count = %u, max points per leaf = %u\n", mData.nodeCount[0], mMaxPointsPerLeaf); + if (mMaxPointsPerLeaf > std::numeric_limits::max()) { + throw std::runtime_error("Too many points per leaf: "+std::to_string(mMaxPointsPerLeaf)); + } + mMemPool.free(d_maxPointsPerLeaf); + } + + mData.pointsPerLeafPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerLeaf, mData.pointsPerLeafPrefix, mData.nodeCount[0]); + + mData.d_leaf_keys = mMemPool.template alloc(mData.nodeCount[0], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_leaf_keys, d_keys, mData.nodeCount[0]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + CALL_CUBS(DeviceSelect::Unique, ShiftRightIterator<12>(mData.d_leaf_keys), d_keys, d_node_count+1, mData.nodeCount[0]);// count lower nodes + cudaCheck(cudaMemcpyAsync(mData.nodeCount+1, d_node_count+1, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_lower_keys = mMemPool.template alloc(mData.nodeCount[1], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_lower_keys, d_keys, mData.nodeCount[1]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + mMemPool.free(d_keys, d_node_count); + if (mVerbose==2) mTimer.stop(); + + //printf("Leaf count = %u, lower count = %u, upper count = %u\n", mData.nodeCount[0], mData.nodeCount[1], mData.nodeCount[2]); +}// PointsToGrid::countNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) +{ + auto sizeofPoint = [&]()->size_t{ + switch (mPointType){ + case PointType::PointID: return sizeof(uint32_t); + case PointType::World64: return sizeof(Vec3d); + case PointType::World32: return sizeof(Vec3f); + case PointType::Grid64: return sizeof(Vec3d); + case PointType::Grid32: return sizeof(Vec3f); + case PointType::Voxel32: return sizeof(Vec3f); + case PointType::Voxel16: return sizeof(Vec3u16); + case PointType::Voxel8: return sizeof(Vec3u8); + case PointType::Default: return pointer_traits::element_size; + default: return size_t(0);// PointType::Disable + } + }; + + mData.grid = 0;// grid is always stored at the start of the buffer! 
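+    // The offsets computed below are running sums of the section sizes, producing a
+    // single breadth-first device allocation laid out as:
+    // | grid | tree | root | upper nodes | lower nodes | leaf nodes | blind meta | blind data |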
+ mData.tree = NanoGrid::memUsage(); // grid ends and tree begins + mData.root = mData.tree + NanoTree::memUsage(); // tree ends and root node begins + mData.upper = mData.root + NanoRoot::memUsage(mData.nodeCount[2]); // root node ends and upper internal nodes begin + mData.lower = mData.upper + NanoUpper::memUsage()*mData.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin + mData.leaf = mData.lower + NanoLower::memUsage()*mData.nodeCount[1]; // lower internal nodes ends and leaf nodes begin + mData.meta = mData.leaf + NanoLeaf::DataType::memUsage()*mData.nodeCount[0];// leaf nodes end and blind meta data begins + mData.blind = mData.meta + sizeof(GridBlindMetaData)*int( mPointType!=PointType::Disable ); // meta data ends and blind data begins + mData.size = mData.blind + pointCount*sizeofPoint();// end of buffer + + auto buffer = BufferT::create(mData.size, &pool, false);// only allocate buffer on the device + mData.d_bufferPtr = buffer.deviceData(); + if (mData.d_bufferPtr == nullptr) throw std::runtime_error("Failed to allocate grid buffer on the device"); + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy Data CPU -> GPU + return buffer; +}// PointsToGrid::getBuffer + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { + // process Root + auto &root = d_data->getRoot(); + root.mBBox = CoordBBox(); // init to empty + root.mTableSize = d_data->nodeCount[2]; + root.mBackground = NanoRoot::ValueType(0);// background_value + root.mMinimum = root.mMaximum = NanoRoot::ValueType(0); + root.mAverage = root.mStdDevi = NanoRoot::FloatType(0); + + // process Tree + auto &tree = d_data->getTree(); + tree.setRoot(&root); + tree.setFirstNode(&d_data->getUpper(0)); + tree.setFirstNode(&d_data->getLower(0)); + tree.setFirstNode(&d_data->getLeaf(0)); + tree.mNodeCount[2] = tree.mTileCount[2] = d_data->nodeCount[2]; + tree.mNodeCount[1] = tree.mTileCount[1] = d_data->nodeCount[1]; + tree.mNodeCount[0] = tree.mTileCount[0] = d_data->nodeCount[0]; + tree.mVoxelCount = d_data->voxelCount; + + // process Grid + auto &grid = d_data->getGrid(); + grid.init({GridFlags::HasBBox, GridFlags::IsBreadthFirst}, d_data->size, d_data->map, toGridType()); + grid.mChecksum = ~uint64_t(0);// set all bits on which means it's disabled + grid.mBlindMetadataCount = util::is_same::value;// ? 
1u : 0u; + grid.mBlindMetadataOffset = d_data->meta; + if (pointType != PointType::Disable) { + const auto lastLeaf = tree.mNodeCount[0] - 1; + grid.mData1 = d_data->pointsPerLeafPrefix[lastLeaf] + d_data->pointsPerLeaf[lastLeaf]; + auto &meta = d_data->getMeta(); + meta.mDataOffset = sizeof(GridBlindMetaData);// blind data is placed right after this meta data + meta.mValueCount = pointCount; + // Blind meta data + switch (pointType){ + case PointType::PointID: + grid.mGridClass = GridClass::PointIndex; + meta.mSemantic = GridBlindDataSemantic::PointId; + meta.mDataClass = GridBlindDataClass::IndexArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(uint32_t); + util::strcpy(meta.mName, "PointID: uint32_t indices to points"); + break; + case PointType::World64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "World64: Vec3 point coordinates in world space"); + break; + case PointType::World32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + break; + case PointType::Grid64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "Grid64: Vec3 point coordinates in grid space"); + break; + case PointType::Grid32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Grid32: Vec3 point coordinates in grid space"); + break; + case PointType::Voxel32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Voxel32: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel16: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u16); + util::strcpy(meta.mName, "Voxel16: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel8: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u8); + util::strcpy(meta.mName, "Voxel8: Vec3 point coordinates in voxel space"); + break; + case PointType::Default: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3T); + if constexpr(util::is_same::value) { + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + } else if constexpr(util::is_same::value){ + util::strcpy(meta.mName, "World64: Vec3 point 
coordinates in world space"); + } else { + printf("Error in PointsToGrid::processGridTreeRoot: expected Vec3T = Vec3f or Vec3d\n"); + } + break; + default: + printf("Error in PointsToGrid::processGridTreeRoot: invalid pointType\n"); + } + } else if constexpr(BuildTraits::is_offindex) { + grid.mData1 = 1u + 512u*d_data->nodeCount[0]; + grid.mGridClass = GridClass::IndexGrid; + } + }, mDeviceData, mPointType);// lambdaKernel + cudaCheckError(); + + char *dst = mData.getGrid().mGridName; + if (const char *src = mGridName.data()) { + cudaCheck(cudaMemcpyAsync(dst, src, GridData::MaxNameSize, cudaMemcpyHostToDevice, mStream)); + } else { + cudaCheck(cudaMemsetAsync(dst, 0, GridData::MaxNameSize, mStream)); + } +}// PointsToGrid::processGridTreeRoot + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processUpperNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + auto &upper = d_data->getUpper(tid); +#if 1 + auto keyToCoord = [](uint64_t key)->nanovdb::Coord{ + static constexpr int64_t offset = 1 << 31;// max values of uint32_t is 2^31 - 1 + static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits + return nanovdb::Coord(int(int64_t(((key >> 42) & MASK) << 12) - offset), // x are the upper 21 bits + int(int64_t(((key >> 21) & MASK) << 12) - offset), // y are the middle 21 bits + int(int64_t(( key & MASK) << 12) - offset)); // z are the lower 21 bits + }; + const Coord ijk = keyToCoord(d_data->d_tile_keys[tid]); +#else + const Coord ijk = NanoRoot::KeyToCoord(d_data->d_tile_keys[tid]); +#endif + root.tile(tid)->setChild(ijk, &upper, &root); + upper.mBBox[0] = ijk; + upper.mFlags = 0; + upper.mValueMask.setOff(); + upper.mChildMask.setOff(); + upper.mMinimum = upper.mMaximum = NanoLower::ValueType(0); + upper.mAverage = upper.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + mMemPool.free(mData.d_tile_keys); + + const uint64_t valueCount = mData.nodeCount[2] << 15; + util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &upper = d_data->getUpper(tid >> 15); + upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processUpperNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processLowerNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + const uint32_t upperOffset = lowerKey & 32767u;// (1 << 15) - 1 = 32767 + upper.mChildMask.setOnAtomic(upperOffset); + auto &lower = d_data->getLower(tid); + upper.setChild(upperOffset, &lower); + lower.mBBox[0] = upper.offsetToGlobalCoord(upperOffset); + lower.mFlags = 0; + lower.mValueMask.setOff(); + lower.mChildMask.setOff(); + lower.mMinimum = lower.mMaximum = NanoLower::ValueType(0);// background; + lower.mAverage = lower.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + const uint64_t valueCount = mData.nodeCount[1] << 12; + 
util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &lower = d_data->getLower(tid >> 12); + lower.mTable[tid & 4095u].value = NanoLower::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processLowerNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processLeafNodes(const PtrT points) +{ + const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats + + if (mVerbose==2) mTimer.start("process leaf meta data"); + // loop over leaf nodes and add it to its parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid], tile_id = leafKey >> 27; + auto &upper = d_data->getUpper(tile_id); + const uint32_t lowerOffset = leafKey & 4095u, upperOffset = (leafKey >> 12) & 32767u; + auto &lower = *upper.getChild(upperOffset); + lower.mChildMask.setOnAtomic(lowerOffset); + auto &leaf = d_data->getLeaf(tid); + lower.setChild(lowerOffset, &leaf); + leaf.mBBoxMin = lower.offsetToGlobalCoord(lowerOffset); + leaf.mFlags = flags; + auto &valueMask = leaf.mValueMask; + valueMask.setOff();// initiate all bits to off + + if constexpr(util::is_same::value) { + leaf.mOffset = d_data->pointsPerLeafPrefix[tid]; + leaf.mPointCount = d_data->pointsPerLeaf[tid]; + } else if constexpr(BuildTraits::is_offindex) { + leaf.mOffset = tid*512u + 1u;// background is index 0 + leaf.mPrefixSum = 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mAverage = leaf.mStdDevi = NanoLeaf::FloatType(0); + leaf.mMinimum = leaf.mMaximum = NanoLeaf::ValueType(0); + } + }, mDeviceData); cudaCheckError(); + + if (mVerbose==2) mTimer.restart("set active voxel state and values"); + // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues + util::cuda::lambdaKernel<<>>(mData.voxelCount, [=] __device__(size_t tid, Data *d_data) { + const uint32_t pointID = d_data->pointsPerVoxelPrefix[tid]; + const uint64_t voxelKey = d_data->d_keys[pointID]; + auto &upper = d_data->getUpper(voxelKey >> 36); + auto &lower = *upper.getChild((voxelKey >> 21) & 32767u); + auto &leaf = *lower.getChild((voxelKey >> 9) & 4095u); + const uint32_t n = voxelKey & 511u; + leaf.mValueMask.setOnAtomic(n);// <--- slow! + if constexpr(util::is_same::value) { + leaf.mValues[n] = uint16_t(pointID + d_data->pointsPerVoxel[tid] - leaf.offset()); + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(1);// set value of active voxels that are not points (or index) + } + }, mDeviceData); cudaCheckError(); + + mMemPool.free(mData.d_keys, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.pointsPerLeafPrefix, mData.pointsPerLeaf); + + if (mVerbose==2) mTimer.restart("set inactive voxel values"); + const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; + util::cuda::lambdaKernel<<>>(denseVoxelCount, [=] __device__(size_t tid, Data *d_data) { + auto &leaf = d_data->getLeaf(tid >> 9u); + const uint32_t n = tid & 511u; + if (leaf.mValueMask.isOn(n)) return; + if constexpr(util::is_same::value) { + const uint32_t m = leaf.mValueMask.findPrev(n - 1); + leaf.mValues[n] = m < 512u ? 
leaf.mValues[m] : 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(0);// value of inactive voxels + } + }, mDeviceData); cudaCheckError(); + + if constexpr(BuildTraits::is_onindex) { + if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); + uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); + auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); + cudaCheckError(); + CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); + mMemPool.free(devValueIndex); + kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); + cudaCheckError(); + mMemPool.free(devValueIndexPrefix); + } + + if constexpr(BuildTraits::is_indexmask) { + if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); + kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); + cudaCheckError(); + } + if (mVerbose==2) mTimer.stop(); +}// PointsToGrid::processLeafNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processPoints(const PtrT, size_t) +{ + mMemPool.free(mData.d_indx); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// Template specialization with BuildT = Point +template <> +template +inline void PointsToGrid::processPoints(const PtrT points, size_t pointCount) +{ + switch (mPointType){ + case PointType::Disable: + throw std::runtime_error("PointsToGrid::processPoints: mPointType == PointType::Disable\n"); + case PointType::PointID: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->d_indx[tid]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMap(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMapF(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel16: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); 
cudaCheckError(); + break; + case PointType::Voxel8: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Default: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint::element_type>(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + default: + printf("Internal error in PointsToGrid::processPoints\n"); + } + mMemPool.free(mData.d_indx); +}// PointsToGrid::processPoints + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processBBox() +{ + if (mData.flags.isMaskOff(GridFlags::HasBBox)) { + mMemPool.free(mData.d_leaf_keys, mData.d_lower_keys); + return; + } + + // reset bbox in lower nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + d_data->getLower(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // update and propagate bbox from leaf -> lower/parent nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid]; + auto &upper = d_data->getUpper(leafKey >> 27); + auto &lower = *upper.getChild((leafKey >> 12) & 32767u); + auto &leaf = d_data->getLeaf(tid); + leaf.updateBBox(); + lower.mBBox.expandAtomic(leaf.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_leaf_keys); + cudaCheckError(); + + // reset bbox in upper nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getUpper(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // propagate bbox from lower -> upper/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + auto &lower = d_data->getLower(tid); + upper.mBBox.expandAtomic(lower.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_lower_keys); + cudaCheckError() + + // propagate bbox from upper -> root/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); + }, mDeviceData); + cudaCheckError(); + + // update the world-bbox in the root node + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data) { + d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processBBox + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid +voxelsToGrid(const PtrT d_ijk, size_t voxelCount, double voxelSize, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(voxelSize, Vec3d(0.0), stream); + return converter.getHandle(d_ijk, voxelCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid 
with PointType coordinates as blind data +pointsToGrid(const PtrT d_xyz, int pointCount, int maxPointsPerVoxel, int tolerance, int maxIterations, PointType type, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(maxPointsPerVoxel, tolerance, maxIterations, Vec3d(0.0), stream); + converter.setPointType(type); + return converter.getHandle(d_xyz, pointCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +pointsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(pointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +voxelsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(voxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +}// namespace tools::cuda ====================================================================================================================================== + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(const PtrT dWorldPoints, + int pointCount, + double voxelSize = 1.0, + PointType type = PointType::Default, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(dWorldPoints, pointCount, voxelSize, type, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(pointSet, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(const PtrT dGridVoxels, + size_t voxelCount, + double voxelSize = 1.0, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(dGridVoxels, voxelCount, voxelSize, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(pointSet, buffer, stream); +} + +}// namespace nanovdb + +#endif // NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED 
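A minimal usage sketch of the main entry point added above (hypothetical setup: d_points is assumed to be a device array of pointCount world-space Vec3f coordinates; the remaining arguments use the declared defaults):

    // Build a Point grid that stores per-point coordinates as 8-bit voxel offsets,
    // targeting roughly 8 points per voxel; the voxel size is searched for iteratively.
    auto handle = nanovdb::tools::cuda::pointsToGrid(
        d_points, pointCount, /*maxPointsPerVoxel=*/8, /*tolerance=*/1,
        /*maxIterations=*/10, nanovdb::PointType::Voxel8);
    cudaDeviceSynchronize();// grid construction runs asynchronously on stream 0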
diff --git a/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh new file mode 100644 index 0000000000..82aece2784 --- /dev/null +++ b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh @@ -0,0 +1,213 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/tools/cuda/SignedFloodFill.cuh + + \author Ken Museth + + \date May 3, 2023 + + \brief Performs signed flood-fill operation on the hierarchical tree structure on the device + + \todo This tool needs to handle the (extremely) rare case when the root node + needs to be modified during the signed flood fill operation. This happens + when the root-table needs to be expanded with tile values (of size 4096^3) + that are completely inside the implicit surface. + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include + +namespace nanovdb { + +namespace tools::cuda { + +/// @brief Performs signed flood-fill operation on the hierarchical tree structure on the device +/// @tparam BuildT Build type of the grid to be flood-filled +/// @param d_grid Non-const device pointer to the grid that will be flood-filled +/// @param verbose If true, timing information will be printed to the terminal +/// @param stream optional cuda stream +template +typename util::enable_if::is_float, void>::type +signedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0); + +namespace {// anonymous namespace + +template +class SignedFloodFill +{ +public: + SignedFloodFill(bool verbose = false, cudaStream_t stream = 0) + : mStream(stream), mVerbose(verbose) {} + + /// @brief Toggle on and off verbose mode + /// @param on if true, verbose is turned on + void setVerbose(bool on = true) {mVerbose = on;} + + void operator()(NanoGrid *d_grid); + +private: + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + bool mVerbose{false}; + +};// SignedFloodFill + +//================================================================================================ + +template +__global__ void processRootKernel(NanoTree *tree) +{ + // auto &root = tree->root(); + /* + using ChildT = typename RootT::ChildNodeType; + // Insert the child nodes into a map sorted according to their origin + std::map nodeKeys; + typename RootT::ChildOnIter it = root.beginChildOn(); + for (; it; ++it) nodeKeys.insert(std::pair(it.getCoord(), &(*it))); + static const Index DIM = RootT::ChildNodeType::DIM; + + // We employ a simple z-scanline algorithm that inserts inactive tiles with + // the inside value if they are sandwiched between inside child nodes only!
+ typename std::map::const_iterator b = nodeKeys.begin(), e = nodeKeys.end(); + if ( b == e ) return; + for (typename std::map::const_iterator a = b++; b != e; ++a, ++b) { + Coord d = b->first - a->first; // delta of neighboring coordinates + if (d[0]!=0 || d[1]!=0 || d[2]==Int32(DIM)) continue;// not same z-scanline or neighbors + const ValueT fill[] = { a->second->getLastValue(), b->second->getFirstValue() }; + if (!(fill[0] < 0) || !(fill[1] < 0)) continue; // scanline isn't inside + Coord c = a->first + Coord(0u, 0u, DIM); + for (; c[2] != b->first[2]; c[2] += DIM) root.addTile(c, mInside, false); + } + */ + //root.setBackground(mOutside, /*updateChildNodes=*/false); +}// processRootKernel + +//================================================================================================ + +template +__global__ void processNodeKernel(NanoTree *tree, size_t count) +{ + using NodeT = typename NanoNode::type; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + const uint32_t nValue = tid & (NodeT::SIZE - 1u); + auto &node = *(tree->template getFirstNode() + (tid >> (3*NodeT::LOG2DIM))); + const auto &mask = node.childMask(); + if (mask.isOn(nValue)) return;// ignore if child + auto value = tree->background();// initiate to outside value + auto n = mask.template findNext(nValue); + if (n < NodeT::SIZE) { + if (node.getChild(n)->getFirstValue() < 0) value = -value; + } else if ((n = mask.template findPrev(nValue)) < NodeT::SIZE) { + if (node.getChild(n)->getLastValue() < 0) value = -value; + } else if (node.getValue(0)<0) { + value = -value; + } + node.setValue(nValue, value); +}// processNodeKernel + +//================================================================================================ + +template +__global__ void processLeafKernel(NanoTree *tree, size_t count) +{ + using LeafT = NanoLeaf; + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + const uint32_t nVoxel = tid & (LeafT::SIZE - 1u); + auto *leaf = tree->getFirstLeaf() + (tid >> (3*LeafT::LOG2DIM)); + const auto &mask = leaf->valueMask(); + if (mask.isOn(nVoxel)) return; + auto *buffer = leaf->mValues; + auto n = mask.template findNext(nVoxel); + if (n == LeafT::SIZE && (n = mask.template findPrev(nVoxel)) == LeafT::SIZE) n = 0u; + buffer[nVoxel] = buffer[n]<0 ? 
-tree->background() : tree->background(); +}// processLeafKernel + +//================================================================================================ + +template +__global__ void cpyNodeCountKernel(NanoGrid *d_grid, uint64_t *d_count) +{ + NANOVDB_ASSERT(d_grid->isSequential()); + for (int i=0; i<3; ++i) *d_count++ = d_grid->tree().nodeCount(i); + *d_count = d_grid->tree().root().tileCount(); +} + +}// anonymous namespace + +//================================================================================================ + +template +void SignedFloodFill::operator()(NanoGrid *d_grid) +{ + static_assert(BuildTraits::is_float, "cuda::SignedFloodFill only works on float grids"); + NANOVDB_ASSERT(d_grid); + uint64_t count[4], *d_count = nullptr; + cudaCheck(util::cuda::mallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream)); + cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_grid, d_count); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream)); + cudaCheck(util::cuda::freeAsync(d_count, mStream)); + + static const int threadsPerBlock = 128; + auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; + auto *tree = reinterpret_cast*>(d_grid + 1); + + if (mVerbose) mTimer.start("\nProcess leaf nodes"); + processLeafKernel<<>>(tree, count[0]<<9); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process lower internal nodes"); + processNodeKernel<<>>(tree, count[1]<<12); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process upper internal nodes"); + processNodeKernel<<>>(tree, count[2]<<15); + cudaCheckError(); + + //if (mVerbose) mTimer.restart("Process root node"); + //processRootKernel<<<1, 1, 0, mStream>>>(tree); + if (mVerbose) mTimer.stop(); + cudaCheckError(); +}// SignedFloodFill::operator() + +//================================================================================================ + +template +typename util::enable_if::is_float, void>::type +signedFloodFill(NanoGrid *d_grid, bool verbose, cudaStream_t stream) +{ + SignedFloodFill sff(verbose, stream); + sff(d_grid); + auto *d_gridData = d_grid->data(); + Checksum cs = getChecksum(d_gridData, stream); + if (cs.isFull()) {// CheckMode::Partial checksum is unaffected + updateChecksum(d_gridData, CheckMode::Full, stream); + } +} + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::tools::cuda::signedFloodFill instead.")]] +typename util::enable_if::is_float, void>::type +cudaSignedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0) +{ + return tools::cuda::signedFloodFill(d_grid, verbose, stream); +} + +}// namespace nanovdb + +#endif // NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/unittest/TestNanoVDB.cc b/nanovdb/nanovdb/unittest/TestNanoVDB.cc index aa84b99202..4d39b443bb 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cc +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cc @@ -13,23 +13,23 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include -#include -#include +#include +#include +#include #if !defined(_MSC_VER) // does not compile in msvc c++ due to zero-sized arrays. #include @@ -63,7 +63,7 @@ struct Sphere const ValueT dst = this->sdf(ijk); return dst >= mBackground ? 
mBackground : dst <= -mBackground ? -mBackground : dst; } - ValueT operator()(const nanovdb::Vec3& p) const + ValueT operator()(const nanovdb::math::Vec3& p) const { const ValueT dst = this->sdf(p); return dst >= mBackground ? mBackground : dst <= -mBackground ? -mBackground : dst; @@ -83,15 +83,15 @@ struct Sphere } private: - ValueT sdf(nanovdb::Vec3 xyz) const + ValueT sdf(nanovdb::math::Vec3 xyz) const { xyz *= mVoxelSize; xyz -= mCenter; return xyz.length() - mRadius; } - ValueT sdf(const nanovdb::Coord& ijk) const { return this->sdf(nanovdb::Vec3(ijk[0], ijk[1], ijk[2])); } - static_assert(nanovdb::is_floating_point::value, "Sphere: expect floating point"); - const nanovdb::Vec3 mCenter; + ValueT sdf(const nanovdb::Coord& ijk) const { return this->sdf(nanovdb::math::Vec3(ijk[0], ijk[1], ijk[2])); } + static_assert(nanovdb::util::is_floating_point::value, "Sphere: expect floating point"); + const nanovdb::math::Vec3 mCenter; const ValueT mRadius, mVoxelSize, mBackground; }; // Sphere @@ -163,12 +163,14 @@ class TestNanoVDB : public ::testing::Test void SetUp() override { + mStr = new char[256]; // Code here will be called immediately after the constructor (right // before each test). } void TearDown() override { + delete [] mStr; // Code here will be called immediately after each test (right // before the destructor). } @@ -185,7 +187,8 @@ class TestNanoVDB : public ::testing::Test const auto n = sizeof(T); std::cerr << "Size of " << s << ": " << n << " bytes which is" << (n % 32 == 0 ? " " : " NOT ") << "32 byte aligned" << std::endl; } - nanovdb::CpuTimer mTimer; + nanovdb::util::Timer mTimer; + char *mStr; }; // TestNanoVDB template @@ -206,28 +209,33 @@ class TestOffsets : public ::testing::Test }; // TestOffsets -using MyTypes = ::testing::Types; +// Ordering of types is identical to GridType in NanoVDB.h +using MyTypes = ::testing::Types;// GridType::UInt8 = 26 TYPED_TEST_SUITE(TestOffsets, MyTypes); @@ -235,6 +243,7 @@ TEST_F(TestNanoVDB, Version) { EXPECT_EQ( 4u, sizeof(uint32_t)); EXPECT_EQ( 4u, sizeof(nanovdb::Version)); + char str[30]; {// default constructor nanovdb::Version v; EXPECT_EQ(uint32_t(NANOVDB_MAJOR_VERSION_NUMBER), v.getMajor()); @@ -244,10 +253,7 @@ TEST_F(TestNanoVDB, Version) ss << NANOVDB_MAJOR_VERSION_NUMBER << "." << NANOVDB_MINOR_VERSION_NUMBER << "." << NANOVDB_PATCH_VERSION_NUMBER; - auto c_str = v.c_str(); - EXPECT_EQ(ss.str(), std::string(c_str)); - std::free(const_cast(c_str)); - //std::cerr << v.c_str() << std::endl; + EXPECT_EQ(ss.str(), std::string(nanovdb::toStr(str, v))); } {// detailed constructor const uint32_t major = (1u << 11) - 1;// maximum allowed value @@ -259,10 +265,7 @@ TEST_F(TestNanoVDB, Version) EXPECT_EQ(patch, v.getPatch()); std::stringstream ss; ss << major << "." << minor << "." << patch; - auto c_str = v.c_str(); - EXPECT_EQ(ss.str(), std::string(c_str)); - std::free(const_cast(c_str)); - //std::cerr << v.c_str() << std::endl; + EXPECT_EQ(ss.str(), std::string(nanovdb::toStr(str, v))); } {// smallest possible version number const uint32_t major = 1u; @@ -274,10 +277,7 @@ TEST_F(TestNanoVDB, Version) EXPECT_EQ(patch, v.getPatch()); std::stringstream ss; ss << major << "." << minor << "." 
<< patch; - auto c_str = v.c_str(); - EXPECT_EQ(ss.str(), std::string(c_str)); - std::free(const_cast(c_str)); - //std::cerr << "version.data = " << v.id() << std::endl; + EXPECT_EQ(ss.str(), std::string(nanovdb::toStr(str, v))); } {// test comparison operators EXPECT_EQ( nanovdb::Version(28, 2, 7), nanovdb::Version( 28, 2, 7) ); @@ -331,7 +331,7 @@ TEST_F(TestNanoVDB, Version) EXPECT_LT(tmp.version, T(29,0,0).version); } } -} +}// Version TEST_F(TestNanoVDB, Basic) { @@ -366,38 +366,86 @@ TEST_F(TestNanoVDB, Basic) EXPECT_EQ(i, *j); //std::cerr << "i="<()); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Unknown ), "?"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Float ), "float"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Double ), "double"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Int16 ), "int16"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Int32 ), "int32"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Int64 ), "int64"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec3f ), "Vec3f"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec3d ), "Vec3d"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Mask ), "Mask"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Half ), "Half"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::UInt32 ), "uint32"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Boolean ), "bool"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::RGBA8 ), "RGBA8"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Fp4 ), "Float4"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Fp8 ), "Float8"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Fp16 ), "Float16"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::FpN ), "FloatN"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec4f ), "Vec4f"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec4d ), "Vec4d"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Index ), "Index"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::OnIndex ), "OnIndex"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::IndexMask ), "IndexMask"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::OnIndexMask ), "OnIndexMask"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::PointIndex ), "PointIndex"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec3u8 ), "Vec3u8"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::Vec3u16 ), "Vec3u16"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridType::End ), "End"), 0 ); + } + {// toStr(GridClass) + EXPECT_EQ(7, nanovdb::strlen()); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::Unknown ), "?"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::LevelSet ), "SDF"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::FogVolume ), "FOG"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::Staggered ), "MAC"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::PointIndex ), "PNTIDX"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::PointData ), "PNTDAT"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::Topology ), "TOPO"), 0 ); + EXPECT_EQ( 
strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::VoxelVolume ), "VOX"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::IndexGrid ), "INDEX"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::TensorGrid ), "TENSOR"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridClass::End ), "END"), 0 ); + } + {// toStr(GridFlags) + EXPECT_EQ(23, nanovdb::strlen()); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::HasLongGridName ), "has long grid name"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::HasBBox ), "has bbox"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::HasMinMax ), "has min/max"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::HasAverage ), "has average"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::HasStdDeviation ), "has standard deviation"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::IsBreadthFirst ), "is breadth-first"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::GridFlags::End ), "end"), 0 ); + } + {// toStr(Codec) + EXPECT_EQ(6, nanovdb::strlen()); + EXPECT_EQ( strcmp(nanovdb::io::toStr(mStr, nanovdb::io::Codec::NONE ), "NONE"), 0 ); + EXPECT_EQ( strcmp(nanovdb::io::toStr(mStr, nanovdb::io::Codec::ZIP ), "ZIP"), 0 ); + EXPECT_EQ( strcmp(nanovdb::io::toStr(mStr, nanovdb::io::Codec::BLOSC ), "BLOSC"), 0 ); + EXPECT_EQ( strcmp(nanovdb::io::toStr(mStr, nanovdb::io::Codec::End ), "END"), 0 ); + } + {// toStr(version) + EXPECT_EQ(8, nanovdb::strlen()); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::Version(12,34,56) ), "12.34.56"), 0 ); + } + {// toStr(MagicType) + EXPECT_EQ(25, nanovdb::strlen()); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::Unknown ), "unknown"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::OpenVDB ), "openvdb"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoVDB ), "nanovdb"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoGrid ), "nanovdb::Grid"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoFile ), "nanovdb::File"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoNode ), "nanovdb::NodeManager"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoFrag ), "fragmented nanovdb::Grid"), 0 ); + } +}// toStr TEST_F(TestNanoVDB, Assumptions) { @@ -417,14 +465,14 @@ TEST_F(TestNanoVDB, Assumptions) EXPECT_EQ(-1, b.a.i); EXPECT_EQ(reinterpret_cast(&b), reinterpret_cast(&(b.a))); EXPECT_EQ(reinterpret_cast(&(b.a)), reinterpret_cast(&(b.a.i))); - EXPECT_EQ(nanovdb::AlignUp<32>(48), 64U); - EXPECT_EQ(nanovdb::AlignUp<8>(16), 16U); -} + EXPECT_EQ(nanovdb::math::AlignUp<32>(48), 64U); + EXPECT_EQ(nanovdb::math::AlignUp<8>(16), 16U); +}// Assumptions TEST_F(TestNanoVDB, Magic) { - EXPECT_EQ(0x304244566f6e614eUL, NANOVDB_MAGIC_NUMBER); // Magic number: "NanoVDB0" in hex) - EXPECT_EQ(0x4e616e6f56444230UL, nanovdb::io::reverseEndianness(NANOVDB_MAGIC_NUMBER)); + EXPECT_EQ(0x304244566f6e614eUL, NANOVDB_MAGIC_NUMB); // Magic number: "NanoVDB0" in hex) + EXPECT_EQ(0x4e616e6f56444230UL, nanovdb::io::reverseEndianness(NANOVDB_MAGIC_NUMB)); // Verify little endian representation const char* str = "NanoVDB0"; // note it's exactly 8 bytes @@ -439,7 +487,7 @@ TEST_F(TestNanoVDB, Magic) uint64_t magic; ss1 >> magic; - EXPECT_EQ(magic, NANOVDB_MAGIC_NUMBER); + EXPECT_EQ(magic, NANOVDB_MAGIC_NUMB); // Verify big endian representation std::stringstream ss2; @@ -451,11 +499,11 
@@ TEST_F(TestNanoVDB, Magic) EXPECT_EQ("0x4e616e6f56444230UL", ss2.str()); ss2 >> magic; - EXPECT_EQ(magic, nanovdb::io::reverseEndianness(NANOVDB_MAGIC_NUMBER)); + EXPECT_EQ(magic, nanovdb::io::reverseEndianness(NANOVDB_MAGIC_NUMB)); {// test all magic numbers const std::string a_str("NanoVDB0"), b_str("NanoVDB1"), c_str("NanoVDB2"); - const uint64_t a = NANOVDB_MAGIC_NUMBER;// NanoVDB0 + const uint64_t a = NANOVDB_MAGIC_NUMB;// NanoVDB0 const uint64_t b = NANOVDB_MAGIC_GRID;// NanoVDB1 const uint64_t c = NANOVDB_MAGIC_FILE;// NanoVDB2 const uint64_t m = NANOVDB_MAGIC_MASK;// masks out most significant byte @@ -473,7 +521,7 @@ TEST_F(TestNanoVDB, Magic) EXPECT_EQ('1', bb[7]); EXPECT_EQ('2', cc[7]); EXPECT_EQ(m & a, m & b); - EXPECT_EQ(NANOVDB_MAGIC_MASK & NANOVDB_MAGIC_NUMBER, NANOVDB_MAGIC_MASK & NANOVDB_MAGIC_FILE); + EXPECT_EQ(NANOVDB_MAGIC_MASK & NANOVDB_MAGIC_NUMB, NANOVDB_MAGIC_MASK & NANOVDB_MAGIC_FILE); } }// Magic @@ -481,74 +529,68 @@ TEST_F(TestNanoVDB, FindBits) { for (uint32_t i = 0; i < 32; ++i) { uint32_t word = uint32_t(1) << i; - EXPECT_EQ(i, nanovdb::FindLowestOn(word)); - EXPECT_EQ(i, nanovdb::FindHighestOn(word)); + EXPECT_EQ(i, nanovdb::util::findLowestOn(word)); + EXPECT_EQ(i, nanovdb::util::findHighestOn(word)); } for (uint32_t i = 0; i < 64; ++i) { uint64_t word = uint64_t(1) << i; - EXPECT_EQ(i, nanovdb::FindLowestOn(word)); - EXPECT_EQ(i, nanovdb::FindHighestOn(word)); + EXPECT_EQ(i, nanovdb::util::findLowestOn(word)); + EXPECT_EQ(i, nanovdb::util::findHighestOn(word)); } -} +}// FindBits TEST_F(TestNanoVDB, CRC32) { { // test function that uses iterators const std::string s{"The quick brown fox jumps over the lazy dog"}; std::stringstream ss; - ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::crc32::checksum(s.c_str(), s.size()); + ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::util::crc32(s.c_str(), s.size()); EXPECT_EQ("414fa339", ss.str()); } { // test the checksum for a modified string const std::string s{"The quick brown Fox jumps over the lazy dog"}; std::stringstream ss; - ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::crc32::checksum(s.c_str(), s.size()); + ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::util::crc32(s.c_str(), s.size()); EXPECT_NE("414fa339", ss.str()); } { // test function that uses void pointer and byte size const std::string s{"The quick brown fox jumps over the lazy dog"}; std::stringstream ss; - ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::crc32::checksum(s.c_str(), s.size()); + ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::util::crc32(s.c_str(), s.size()); EXPECT_EQ("414fa339", ss.str()); } { // test accumulation const std::string s1{"The quick brown fox jum"}; - uint32_t crc = nanovdb::crc32::checksum(s1.c_str(), s1.size()); + uint32_t crc = nanovdb::util::crc32(s1.c_str(), s1.size()); const std::string s2{"ps over the lazy dog"}; - crc = nanovdb::crc32::checksum(s2.c_str(), s2.size(), crc); + crc = nanovdb::util::crc32(s2.c_str(), s2.size(), crc); std::stringstream ss; ss << std::hex << std::setw(8) << std::setfill('0') << crc; EXPECT_EQ("414fa339", ss.str()); } { // test accumulation with lookup table - auto lut = nanovdb::crc32::createLut(); + auto lut = nanovdb::util::createCrc32Lut(); const std::string s1{"The quick brown fox jum"}; - uint32_t crc = nanovdb::crc32::checksum(s1.c_str(), s1.size(), lut.get()); + uint32_t crc = nanovdb::util::crc32(s1.c_str(), s1.size(), lut.get()); const std::string s2{"ps over the lazy dog"}; 
- crc = nanovdb::crc32::checksum(s2.c_str(), s2.size(), lut.get(), crc); + crc = nanovdb::util::crc32(s2.c_str(), s2.size(), lut.get(), crc); std::stringstream ss; ss << std::hex << std::setw(8) << std::setfill('0') << crc; EXPECT_EQ("414fa339", ss.str()); } { - //EXPECT_EQ(~uint64_t(0), nanovdb::GridChecksum::EMPTY); - nanovdb::GridChecksum cs(~uint64_t(0)); - EXPECT_EQ(nanovdb::ChecksumMode::Disable, cs.mode()); + EXPECT_EQ(sizeof(uint64_t), sizeof(nanovdb::Checksum)); + nanovdb::Checksum cs; + EXPECT_EQ(nanovdb::CheckMode::Disable, cs.mode()); + EXPECT_EQ(~uint64_t(0), cs.full()); EXPECT_TRUE(cs.isEmpty()); EXPECT_FALSE(cs.isFull()); } - { - nanovdb::GridChecksum cs; - EXPECT_EQ(~uint64_t(0), cs.checksum()); - EXPECT_EQ(nanovdb::ChecksumMode::Disable, cs.mode()); - EXPECT_TRUE(cs.isEmpty()); - EXPECT_FALSE(cs.isFull()); - } -} +}// CRC32 TEST_F(TestNanoVDB, Range1D) { - nanovdb::Range1D r1(0, 20, 2); + nanovdb::util::Range1D r1(0, 20, 2); EXPECT_FALSE(r1.empty()); EXPECT_EQ(2U, r1.grainsize()); EXPECT_EQ(20U, r1.size()); @@ -557,7 +599,7 @@ TEST_F(TestNanoVDB, Range1D) EXPECT_EQ(0U, r1.begin()); EXPECT_EQ(20U, r1.end()); - nanovdb::Range1D r2(r1, nanovdb::Split()); + nanovdb::util::Range1D r2(r1, nanovdb::util::Split()); EXPECT_FALSE(r1.empty()); EXPECT_EQ(2U, r1.grainsize()); @@ -574,11 +616,11 @@ TEST_F(TestNanoVDB, Range1D) EXPECT_TRUE(r2.is_divisible()); EXPECT_EQ(10U, r2.begin()); EXPECT_EQ(20U, r2.end()); -} +}// Range1D TEST_F(TestNanoVDB, Range2D) { - nanovdb::Range<2, int> r1(-20, 20, 1u, 0, 20, 2u); + nanovdb::util::Range<2, int> r1(-20, 20, 1u, 0, 20, 2u); EXPECT_FALSE(r1.empty()); EXPECT_EQ(1U, r1[0].grainsize()); @@ -595,7 +637,7 @@ TEST_F(TestNanoVDB, Range2D) EXPECT_EQ(0, r1[1].begin()); EXPECT_EQ(20, r1[1].end()); - nanovdb::Range<2, int> r2(r1, nanovdb::Split()); + nanovdb::util::Range<2, int> r2(r1, nanovdb::util::Split()); EXPECT_FALSE(r1.empty()); EXPECT_EQ(1U, r1[0].grainsize()); @@ -627,11 +669,11 @@ TEST_F(TestNanoVDB, Range2D) EXPECT_EQ(0, r2[1].begin()); EXPECT_EQ(20, r2[1].end()); EXPECT_EQ(r1[1], r2[1]); -} +}// Range2D TEST_F(TestNanoVDB, Range3D) { - nanovdb::Range<3, int> r1(-20, 20, 1u, 0, 20, 2u, 0, 10, 5); + nanovdb::util::Range<3, int> r1(-20, 20, 1u, 0, 20, 2u, 0, 10, 5); EXPECT_FALSE(r1.empty()); EXPECT_EQ(1U, r1[0].grainsize()); @@ -655,7 +697,7 @@ TEST_F(TestNanoVDB, Range3D) EXPECT_EQ(0, r1[2].begin()); EXPECT_EQ(10, r1[2].end()); - nanovdb::Range<3, int> r2(r1, nanovdb::Split()); + nanovdb::util::Range<3, int> r2(r1, nanovdb::util::Split()); EXPECT_FALSE(r1.empty()); EXPECT_EQ(1U, r1[0].grainsize()); @@ -702,7 +744,7 @@ TEST_F(TestNanoVDB, Range3D) EXPECT_EQ(0, r2[2].begin()); EXPECT_EQ(10, r2[2].end()); EXPECT_EQ(r1[2], r2[2]); -} +}// Range3D TEST_F(TestNanoVDB, invoke) { @@ -715,11 +757,11 @@ TEST_F(TestNanoVDB, invoke) auto kernel1 = [&array](){array[1]=1; }; auto kernel2 = [&array](){array[2]=2; }; auto kernel3 = [&array](){array[3]=3; }; - nanovdb::invoke(kernel0, kernel1, kernel2, kernel3); + nanovdb::util::invoke(kernel0, kernel1, kernel2, kernel3); for (int i=0; i()) - // <<"> is padding with " << (nanovdb::AlignUp(offset)-offset) + //std::cerr << "RootData<"<()) + // <<"> is padding with " << (nanovdb::math::AlignUp(offset)-offset) // << " bytes" << std::endl; //std::cerr << "Is padded: " << (DataT::padding() ? 
"yes" : "no") << std::endl; - EXPECT_EQ(DataT::padding()>0, offset != nanovdb::AlignUp(offset)); - offset = nanovdb::AlignUp(offset); + EXPECT_EQ(DataT::padding()>0, offset != nanovdb::math::AlignUp(offset)); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(offset, (int)sizeof(DataT));// size of RootData } {// check memory layout of upper internal nodes using DataT = typename nanovdb::NanoUpper::DataType; - bool test = nanovdb::is_same::value; + bool test = nanovdb::util::is_same::value; EXPECT_TRUE(test); int offsets[] = { NANOVDB_OFFSETOF(DataT, mBBox), @@ -2325,34 +2396,34 @@ TYPED_TEST(TestOffsets, NanoVDB) offset += sizeof(ValueType); EXPECT_EQ(*p++, offset); offset += sizeof(ValueType); - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; - offset = nanovdb::AlignUp(offset); + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(*p++, offset); offset += sizeof(StatsT); - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; - offset = nanovdb::AlignUp(offset); + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(*p++, offset); offset += sizeof(StatsT); - if (offset!=nanovdb::AlignUp<32>(offset)) is_padded=true; - offset = nanovdb::AlignUp<32>(offset); + if (offset!=nanovdb::math::AlignUp<32>(offset)) is_padded=true; + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(*p++, offset); - if (sizeof(ValueType)!=nanovdb::AlignUp<8>(sizeof(ValueType))) is_padded=true; - const size_t tile_size = nanovdb::AlignUp<8>(sizeof(ValueType)); + if (sizeof(ValueType)!=nanovdb::math::AlignUp<8>(sizeof(ValueType))) is_padded=true; + const size_t tile_size = nanovdb::math::AlignUp<8>(sizeof(ValueType)); EXPECT_EQ(sizeof(typename DataT::Tile), tile_size); offset += (32*32*32)*tile_size; - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; //if (is_padded != (DataT::padding()>0)) { - // std::cerr << "Upper InternalData<" << nanovdb::toStr(nanovdb::mapToGridType()) + // std::cerr << "Upper InternalData<" << nanovdb::toStr(nanovdb::toGridType()) // << "> is padding: " << (DataT::padding() ? "yes" : "no") << std::endl; // std::cerr << "is_padded: " << (is_padded>0 ? 
"yes" : "no") << std::endl; //} EXPECT_EQ(is_padded, bool(DataT::padding())); - offset = nanovdb::AlignUp(offset); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(sizeof(DataT), (size_t)offset); } {// check memory of lower internal nodes using DataT = typename nanovdb::NanoLower::DataType; - bool test = nanovdb::is_same::value; + bool test = nanovdb::util::is_same::value; EXPECT_TRUE(test); int offsets[] = { NANOVDB_OFFSETOF(DataT, mBBox), @@ -2380,34 +2451,34 @@ TYPED_TEST(TestOffsets, NanoVDB) offset += sizeof(ValueType); EXPECT_EQ(*p++, offset); offset += sizeof(ValueType); - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; - offset = nanovdb::AlignUp(offset); + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(*p++, offset); offset += sizeof(StatsT); - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; - offset = nanovdb::AlignUp(offset); + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(*p++, offset); offset += sizeof(StatsT); - if (offset!=nanovdb::AlignUp<32>(offset)) is_padded=true; - offset = nanovdb::AlignUp<32>(offset); + if (offset!=nanovdb::math::AlignUp<32>(offset)) is_padded=true; + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(*p++, offset); - if (sizeof(ValueType)!=nanovdb::AlignUp<8>(sizeof(ValueType))) is_padded=true; - const size_t tile_size = nanovdb::AlignUp<8>(sizeof(ValueType)); + if (sizeof(ValueType)!=nanovdb::math::AlignUp<8>(sizeof(ValueType))) is_padded=true; + const size_t tile_size = nanovdb::math::AlignUp<8>(sizeof(ValueType)); EXPECT_EQ(sizeof(typename DataT::Tile), tile_size); offset += (16*16*16)*tile_size; - if (offset!=nanovdb::AlignUp(offset)) is_padded=true; + if (offset!=nanovdb::math::AlignUp(offset)) is_padded=true; //if (is_padded != (DataT::padding()>0)) { - // std::cerr << "Lower InternalData<" << nanovdb::toStr(nanovdb::mapToGridType()) + // std::cerr << "Lower InternalData<" << nanovdb::toStr(nanovdb::toGridType()) // << "> is padding: " << (DataT::padding() ? "yes" : "no") << std::endl; // std::cerr << "is_padded: " << (is_padded>0 ? "yes" : "no") << std::endl; //} EXPECT_EQ(is_padded, bool(DataT::padding())); - offset = nanovdb::AlignUp(offset); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(sizeof(DataT), (size_t)offset); } {// check memory of leaf nodes using DataT = typename nanovdb::LeafNode::DataType; - bool test = nanovdb::is_same::value; + bool test = nanovdb::util::is_same::value; EXPECT_TRUE(test); int offsets[] = { NANOVDB_OFFSETOF(DataT, mBBoxMin), @@ -2427,10 +2498,37 @@ TYPED_TEST(TestOffsets, NanoVDB) EXPECT_EQ(*p++, offset); offset += 64;// = 8*8*8/8 checkLeaf(offset); - offset = nanovdb::AlignUp(offset); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(sizeof(DataT), (size_t)offset); - //std::cerr << "LeafData<" << nanovdb::toStr(nanovdb::mapToGridType()) - // <<"> is padding: " << (DataT::padding() ? "yes" : "no") << std::endl; +#if 0// disable with 0 + char str[30]; + std::cerr << "LeafData<" << nanovdb::toStr(str, nanovdb::toGridType()) + << "> is padded: " << (DataT::padding() ? 
"yes" : "no") << std::endl; +/* + LeafData is padded: no + LeafData is padded: yes + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: yes + LeafData is padded: no + LeafData is padded: yes + LeafData is padded: yes + LeafData is padded: yes + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: no + LeafData is padded: yes + LeafData is padded: yes + LeafData is padded: yes + LeafData is padded: no +*/ +#endif } }// TestOffsets NanoVDB @@ -2445,13 +2543,13 @@ void checkLeaf(int &offset) offset += sizeof(ValueType); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mMaximum), offset); offset += sizeof(ValueType); - offset = nanovdb::AlignUp(offset); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mAverage), offset); offset += sizeof(StatsT); - offset = nanovdb::AlignUp(offset); + offset = nanovdb::math::AlignUp(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mStdDevi), offset); offset += sizeof(StatsT); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mValues), offset); offset += (8*8*8)*sizeof(ValueType); } @@ -2534,7 +2632,7 @@ void checkLeaf(int &offset) offset += sizeof(uint16_t); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mDev), offset); offset += sizeof(uint16_t); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mCode), offset); offset += 256*sizeof(uint8_t); } @@ -2555,7 +2653,7 @@ void checkLeaf(int &offset) offset += sizeof(uint16_t); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mDev), offset); offset += sizeof(uint16_t); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mCode), offset); offset += 512*sizeof(uint8_t); } @@ -2576,7 +2674,7 @@ void checkLeaf(int &offset) offset += sizeof(uint16_t); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mDev), offset); offset += sizeof(uint16_t); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mCode), offset); offset += 512*sizeof(uint16_t); } @@ -2597,7 +2695,7 @@ void checkLeaf(int &offset) offset += sizeof(uint16_t); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mDev), offset); offset += sizeof(uint16_t); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); } template<> @@ -2608,7 +2706,7 @@ void checkLeaf(int &offset) offset += sizeof(uint64_t); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mPointCount), offset); offset += sizeof(uint64_t); - offset = nanovdb::AlignUp<32>(offset); + offset = nanovdb::math::AlignUp<32>(offset); EXPECT_EQ(NANOVDB_OFFSETOF(DataT, mValues), offset); offset += (8*8*8)*sizeof(uint16_t); } @@ -2648,8 +2746,8 @@ TEST_F(TestNanoVDB, BasicGrid) */ } - EXPECT_EQ(sizeof(GridT), nanovdb::AlignUp(8 + 8 + 4 + 4 + 8 + nanovdb::GridData::MaxNameSize + 48 + sizeof(nanovdb::Map) + 24 + 4 + 4 + 8 + 4)); - EXPECT_EQ(sizeof(TreeT), nanovdb::AlignUp(4*8 + 3*4 + 3*4 + 8)); + EXPECT_EQ(sizeof(GridT), nanovdb::math::AlignUp(8 + 8 + 4 + 4 + 8 + nanovdb::GridData::MaxNameSize + 48 + sizeof(nanovdb::Map) + 24 + 4 + 4 + 8 + 4)); + EXPECT_EQ(sizeof(TreeT), nanovdb::math::AlignUp(4*8 + 3*4 + 3*4 + 8)); EXPECT_EQ(sizeof(TreeT), size_t(4*8 + 3*4 + 3*4 + 8));// should already be 32 byte aligned uint64_t bytes[6] = {GridT::memUsage(), TreeT::memUsage(), RootT::memUsage(1), 
NodeT2::memUsage(), NodeT1::memUsage(), LeafT::DataType::memUsage()}; @@ -2774,7 +2872,7 @@ TEST_F(TestNanoVDB, BasicGrid) data->mMap.set(mat, invMat, 1.0); data->mGridClass = nanovdb::GridClass::Unknown; data->mGridType = nanovdb::GridType::Float; - data->mMagic = NANOVDB_MAGIC_NUMBER; + data->mMagic = NANOVDB_MAGIC_NUMB; data->mVersion = nanovdb::Version(); #endif memcpy(data->mGridName, name.c_str(), name.size() + 1); @@ -2940,10 +3038,10 @@ TEST_F(TestNanoVDB, BasicGrid) TEST_F(TestNanoVDB, GridBuilderEmpty) { { // empty grid - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f, "test"); auto srcAcc = srcGrid.getAccessor(); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -2982,10 +3080,10 @@ TEST_F(TestNanoVDB, GridBuilderEmpty) TEST_F(TestNanoVDB, BuilderGridEmpty) { { // empty grid - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT grid(0.0f, "test"); auto srcAcc = grid.getAccessor(); - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3024,7 +3122,7 @@ TEST_F(TestNanoVDB, BuilderGridEmpty) TEST_F(TestNanoVDB, CreateNanoGrid_Basic1) { { // 1 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; const nanovdb::Coord ijk(1,2,3); SrcGridT grid(0.0f); auto srcAcc = grid.getAccessor(); @@ -3036,7 +3134,7 @@ TEST_F(TestNanoVDB, CreateNanoGrid_Basic1) EXPECT_EQ(1.0f, srcAcc.getValue(ijk)); EXPECT_EQ(1.0f, srcAcc.getValue(1,2,3)); - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3075,7 +3173,7 @@ TEST_F(TestNanoVDB, CreateNanoGrid_Basic1) TEST_F(TestNanoVDB, CreateNanoGrid_addTile) { { // 1 grid point and 1 tile - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; const nanovdb::Coord ijk(1,2,3); SrcGridT grid(0.0f); auto srcAcc = grid.getAccessor(); @@ -3093,7 +3191,7 @@ TEST_F(TestNanoVDB, CreateNanoGrid_addTile) EXPECT_EQ(2.0f, srcAcc.getValue(ijk2)); EXPECT_EQ(2.0f, srcAcc.getValue(-1,-2,-3)); - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3127,7 +3225,7 @@ TEST_F(TestNanoVDB, CreateNanoGrid_addTile) TEST_F(TestNanoVDB, GridBuilderValueMask) { { // 1 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; const nanovdb::Coord ijk(1,2,3); SrcGridT grid(false); auto srcAcc = grid.getAccessor(); @@ -3137,7 +3235,7 @@ TEST_F(TestNanoVDB, GridBuilderValueMask) EXPECT_EQ(1u, nodeCount[1]); EXPECT_EQ(1u, nodeCount[2]); EXPECT_EQ(true, srcAcc.getValue(ijk)); - auto handle = nanovdb::createNanoGrid(grid); + auto handle = nanovdb::tools::createNanoGrid(grid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3178,7 +3276,7 @@ TEST_F(TestNanoVDB, GridBuilderValueMask) TEST_F(TestNanoVDB, GridBuilderBasic2) { { // 2 grid points - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT grid(0.0f, "test"); auto srcAcc = grid.getAccessor(); const nanovdb::Coord ijk1(1,2,3), 
ijk2(2,-2,9); @@ -3191,7 +3289,7 @@ TEST_F(TestNanoVDB, GridBuilderBasic2) EXPECT_EQ(2u, nodeCount[1]); EXPECT_EQ(2u, nodeCount[2]); - nanovdb::build::NodeManager srcMgr(grid); + nanovdb::tools::build::NodeManager srcMgr(grid); EXPECT_EQ(2u, srcMgr.nodeCount(0)); EXPECT_EQ(2u, srcMgr.nodeCount(1)); EXPECT_EQ(2u, srcMgr.nodeCount(2)); @@ -3201,7 +3299,7 @@ TEST_F(TestNanoVDB, GridBuilderBasic2) //for (int i=0;i indexBBox = dstGrid->indexBBox(); + const nanovdb::math::BBox indexBBox = dstGrid->indexBBox(); EXPECT_DOUBLE_EQ( 0.0, indexBBox[0][0]); EXPECT_DOUBLE_EQ( 0.0, indexBBox[0][1]); EXPECT_DOUBLE_EQ( 0.0, indexBBox[0][2]); @@ -3335,9 +3433,9 @@ TEST_F(TestNanoVDB, GridBuilderPrune) TEST_F(TestNanoVDB, GridBuilder_Vec3f) { using VoxelT = nanovdb::Vec3f; - EXPECT_EQ(nanovdb::AlignUp(12 + 3 + 1 + 2*4 + 64 + 3*(2*4 + 512*4)), sizeof(nanovdb::NanoLeaf)); + EXPECT_EQ(nanovdb::math::AlignUp(12 + 3 + 1 + 2*4 + 64 + 3*(2*4 + 512*4)), sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(VoxelT(0.0f)); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 1, 2, 3), nanovdb::Vec3f(1.0f)); @@ -3349,7 +3447,7 @@ TEST_F(TestNanoVDB, GridBuilder_Vec3f) EXPECT_EQ(nanovdb::Vec3f(2.0f), srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(nanovdb::Vec3f(3.0f), srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3358,7 +3456,7 @@ TEST_F(TestNanoVDB, GridBuilder_Vec3f) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3400,9 +3498,9 @@ TEST_F(TestNanoVDB, GridBuilder_Vec3f) TEST_F(TestNanoVDB, GridBuilder_Vec4f) { using VoxelT = nanovdb::Vec4f; - EXPECT_EQ(nanovdb::AlignUp(12 + 3 + 1 + 2*4 + 64 + 4*(2*4 + 512*4)), sizeof(nanovdb::NanoLeaf)); + EXPECT_EQ(nanovdb::math::AlignUp(12 + 3 + 1 + 2*4 + 64 + 4*(2*4 + 512*4)), sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(VoxelT(0.0f)); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 1, 2, 3), nanovdb::Vec4f(1.0f)); @@ -3414,7 +3512,7 @@ TEST_F(TestNanoVDB, GridBuilder_Vec4f) EXPECT_EQ(nanovdb::Vec4f(2.0f), srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(nanovdb::Vec4f(3.0f), srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3423,7 +3521,7 @@ TEST_F(TestNanoVDB, GridBuilder_Vec4f) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + 
EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3467,7 +3565,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) using VoxelT = nanovdb::Fp4; EXPECT_EQ(96u + 512u/2, sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 1, 2, 3), 1.0f); @@ -3479,7 +3577,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) EXPECT_EQ(2.0f, srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3488,7 +3586,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3532,20 +3630,20 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) auto mgrHandle = nanovdb::createNodeManager(*dstGrid); auto *nodeMgr = mgrHandle.mgr(); - EXPECT_TRUE(nanovdb::isValid(nodeMgr)); + EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); EXPECT_TRUE(nodeMgr->isLinear()); uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -3559,11 +3657,11 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) const nanovdb::Vec3d center(0), origin(0); const float tolerance = 0.5f * voxelSize; - auto handle = nanovdb::createLevelSetSphere(radius, center, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth, origin, "sphere", - nanovdb::StatsMode::Default, - nanovdb::ChecksumMode::Default); + nanovdb::tools::StatsMode::Default, + nanovdb::CheckMode::Default); auto* nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); Sphere sphere(center, radius, voxelSize, halfWidth); @@ -3574,11 +3672,11 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) EXPECT_NEAR(nanoAcc.getValue(p), sphere(p), tolerance); } }; - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); nanovdb::io::writeGrid("data/sphere_fp4.nvdb", handle); ASSERT_THROW(nanovdb::io::readGrid("data/sphere_fp4.nvdb", 1), std::runtime_error); - //nanovdb::CpuTimer timer; + //nanovdb::util::Timer timer; //timer.start("read all grids"); //handle = nanovdb::io::readGrid("data/sphere_fp4.nvdb"); //timer.start("read first grid"); 
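Editorial sketch (not part of the patch), condensing the renames this hunk exercises: build::Grid -> tools::build::Grid, createLevelSetSphere and StatsMode now under nanovdb::tools, ChecksumMode -> CheckMode, and forEach -> util::forEach. The explicit <nanovdb::Fp4> template argument and the include paths are assumptions of this sketch (the moved include names were lost in extraction above).

    #include <nanovdb/tools/CreatePrimitives.h>
    #include <nanovdb/util/ForEach.h>

    // Builds an Fp4 level-set sphere with the relocated API, then touches every
    // voxel in its index bounding box in parallel, mirroring the test kernel.
    void verifySphere()
    {
        auto handle = nanovdb::tools::createLevelSetSphere<nanovdb::Fp4>(
            /*radius=*/100.0f, /*center=*/nanovdb::Vec3d(0),
            /*voxelSize=*/1.0, /*halfWidth=*/3.0, /*origin=*/nanovdb::Vec3d(0),
            "sphere", nanovdb::tools::StatsMode::Default, nanovdb::CheckMode::Default);
        auto* grid = handle.grid<nanovdb::Fp4>();
        nanovdb::util::forEach(grid->indexBBox(), [&](const nanovdb::CoordBBox& b) {
            auto acc = grid->getAccessor();// one accessor per sub-range
            for (auto it = b.begin(); it; ++it) acc.getValue(*it);
        });
    }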
@@ -3586,7 +3684,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) //timer.stop(); nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); //timer.start("read first grid"); //handle = nanovdb::io::readGrid("data/sphere_fp4.nvdb", 0); @@ -3595,7 +3693,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp4) //timer.stop(); nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); } } // GridBuilder_Fp4 @@ -3604,7 +3702,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) using VoxelT = nanovdb::Fp8; EXPECT_EQ(96u + 512u, sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); @@ -3617,7 +3715,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) EXPECT_EQ(2.0f, srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3626,7 +3724,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3667,20 +3765,20 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) auto mgrHandle = nanovdb::createNodeManager(*dstGrid); auto *nodeMgr = mgrHandle.mgr(); - EXPECT_TRUE(nanovdb::isValid(nodeMgr)); + EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); EXPECT_TRUE(nodeMgr->isLinear()); uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -3694,11 +3792,11 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) const nanovdb::Vec3d center(0), origin(0); const float tolerance = 0.05f * voxelSize; - auto handle = nanovdb::createLevelSetSphere(radius, center, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth, origin, "sphere", - nanovdb::StatsMode::Default, - nanovdb::ChecksumMode::Default); + nanovdb::tools::StatsMode::Default, + nanovdb::CheckMode::Default); auto* nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); Sphere sphere(center, radius, float(voxelSize), float(halfWidth)); @@ -3709,14 +3807,14 @@ TEST_F(TestNanoVDB, GridBuilder_Fp8) EXPECT_NEAR(nanoAcc.getValue(p), sphere(p), tolerance); } }; - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + 
nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); nanovdb::io::writeGrid("data/sphere_fp8.nvdb", handle); handle = nanovdb::io::readGrid("data/sphere_fp8.nvdb"); nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); } } // GridBuilder_Fp8 @@ -3725,7 +3823,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) using VoxelT = nanovdb::Fp16; EXPECT_EQ(96u + 512u*2, sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 1, 2, 3), 1.0f); @@ -3737,7 +3835,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) EXPECT_EQ(2.0f, srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3746,7 +3844,7 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3787,20 +3885,20 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) auto mgrHandle = nanovdb::createNodeManager(*dstGrid); auto *nodeMgr = mgrHandle.mgr(); - EXPECT_TRUE(nanovdb::isValid(nodeMgr)); + EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); EXPECT_TRUE(nodeMgr->isLinear()); uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -3814,11 +3912,11 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) const nanovdb::Vec3d center(0), origin(0); const float tolerance = 0.005f * voxelSize; - auto handle = nanovdb::createLevelSetSphere(radius, center, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth, origin, "sphere", - nanovdb::StatsMode::Default, - nanovdb::ChecksumMode::Default); + nanovdb::tools::StatsMode::Default, + nanovdb::CheckMode::Default); auto* nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); Sphere sphere(center, radius, float(voxelSize), float(halfWidth)); @@ -3829,14 +3927,14 @@ TEST_F(TestNanoVDB, GridBuilder_Fp16) EXPECT_NEAR(nanoAcc.getValue(p), sphere(p), tolerance); } }; - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); nanovdb::io::writeGrid("data/sphere_fp16.nvdb", handle); handle = nanovdb::io::readGrid("data/sphere_fp16.nvdb"); 
nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); } } // GridBuilder_Fp16 @@ -3845,7 +3943,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic1) using VoxelT = nanovdb::FpN; EXPECT_EQ(96u, sizeof(nanovdb::NanoLeaf)); { // 1 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 0, 0, 0), 1.0f); @@ -3853,7 +3951,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic1) EXPECT_TRUE(srcAcc.isValueOn(nanovdb::Coord(0, 0, 0))); EXPECT_EQ(1.0f, srcAcc.getValue(nanovdb::Coord( 0, 0, 0))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3862,7 +3960,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic1) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3905,7 +4003,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic3) using VoxelT = nanovdb::FpN; EXPECT_EQ(96u, sizeof(nanovdb::NanoLeaf)); { // 3 grid point - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); srcAcc.setValue(nanovdb::Coord( 1, 2, 3), 1.0f); @@ -3917,7 +4015,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic3) EXPECT_EQ(2.0f, srcAcc.getValue(nanovdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(nanovdb::Coord( 50,-12, 30))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -3926,7 +4024,7 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic3) EXPECT_EQ(uint32_t(NANOVDB_MINOR_VERSION_NUMBER), meta->version().getMinor()); EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), meta->version().getPatch()); EXPECT_EQ("", std::string(meta->shortGridName())); - EXPECT_EQ(nanovdb::mapToGridType(), meta->gridType()); + EXPECT_EQ(nanovdb::toGridType(), meta->gridType()); EXPECT_EQ(nanovdb::GridClass::Unknown, meta->gridClass()); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -3967,20 +4065,20 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Basic3) auto mgrHandle = nanovdb::createNodeManager(*dstGrid); auto *nodeMgr = mgrHandle.mgr(); - EXPECT_TRUE(nanovdb::isValid(nodeMgr)); + EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); EXPECT_FALSE(nodeMgr->isLinear()); uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - 
EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -4000,11 +4098,11 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Sphere) const nanovdb::Vec3d center(0), origin(0); const float tolerance = 0.5f * voxelSize; - auto handle = nanovdb::createLevelSetSphere(radius, center, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth, origin, "sphere", - nanovdb::StatsMode::Default, - nanovdb::ChecksumMode::Default, + nanovdb::tools::StatsMode::Default, + nanovdb::CheckMode::Default, tolerance, false); auto* nanoGrid = handle.grid(); @@ -4017,26 +4115,26 @@ TEST_F(TestNanoVDB, GridBuilder_FpN_Sphere) EXPECT_NEAR(nanoAcc.getValue(p), sphere(p), tolerance); } }; - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); nanovdb::io::writeGrid("data/sphere_fpN.nvdb", handle); handle = nanovdb::io::readGrid("data/sphere_fpN.nvdb"); nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(nanoGrid->indexBBox(), kernel); + nanovdb::util::forEach(nanoGrid->indexBBox(), kernel); } } // GridBuilder_FpN_Sphere TEST_F(TestNanoVDB, NodeManager) { { // 1 active voxel - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f, "test", nanovdb::GridClass::LevelSet); auto srcAcc = srcGrid.getAccessor(); const nanovdb::Coord x0(1, 2, 3), x1(1, 2, 4); srcAcc.setValue(x1, 1.0f); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -4052,8 +4150,7 @@ TEST_F(TestNanoVDB, NodeManager) auto *nodeMgr = nodeMgrHandle.mgr(); EXPECT_TRUE(nodeMgr); EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); - EXPECT_TRUE(nanovdb::isValid(nodeMgr)); - EXPECT_FALSE(nanovdb::isValid(nullptr)); + EXPECT_TRUE(nanovdb::isAligned(nodeMgr)); EXPECT_TRUE(nanovdb::isAligned(nullptr)); EXPECT_TRUE(nodeMgr->isLinear()); @@ -4085,15 +4182,15 @@ TEST_F(TestNanoVDB, NodeManager) uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -4103,13 +4200,13 @@ TEST_F(TestNanoVDB, NodeManager) EXPECT_EQ(dstGrid->tree().nodeCount(2), n[0]); } { // 2 active voxels - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f, "test", nanovdb::GridClass::LevelSet); auto srcAcc = srcGrid.getAccessor(); const nanovdb::Coord x0(1, 2, 3), x1(2,-2, 9), x2(1, 2, 4); srcAcc.setValue(x1, 1.0f); srcAcc.setValue(x2, 2.0f); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -4153,15 +4250,15 @@ 
TEST_F(TestNanoVDB, NodeManager) uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -4184,13 +4281,13 @@ TEST_F(TestNanoVDB, NodeManager) } } EXPECT_EQ(voxelCount, voxels.size()); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(-1.0f, "test", nanovdb::GridClass::LevelSet); auto srcAcc = srcGrid.getAccessor(); for (size_t i=0; i(); EXPECT_TRUE(dstGrid); @@ -4212,15 +4309,15 @@ TEST_F(TestNanoVDB, NodeManager) uint64_t n[3]={0}; for (auto it2 = dstGrid->tree().root().cbeginChild(); it2; ++it2) { auto *node2 = &nodeMgr->upper(n[0]++); - EXPECT_TRUE(nanovdb::isValid(node2)); + EXPECT_TRUE(nanovdb::isAligned(node2)); EXPECT_EQ(&*it2, node2); for (auto it1 = it2->cbeginChild(); it1; ++it1) { auto *node1 = &nodeMgr->lower(n[1]++); - EXPECT_TRUE(nanovdb::isValid(node1)); + EXPECT_TRUE(nanovdb::isAligned(node1)); EXPECT_EQ(&*it1, node1); for (auto it0 = it1->cbeginChild(); it0; ++it0) { auto *node0 = &nodeMgr->leaf(n[2]++); - EXPECT_TRUE(nanovdb::isValid(node0)); + EXPECT_TRUE(nanovdb::isAligned(node0)); EXPECT_EQ(&*it0, node0); }// loop over child nodes of the lower internal node }// loop over child nodes of the upper internal node @@ -4234,7 +4331,7 @@ TEST_F(TestNanoVDB, NodeManager) TEST_F(TestNanoVDB, GridBuilderBasicDense) { { // dense functor - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f, "test", nanovdb::GridClass::LevelSet); const nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(100)); auto func = [](const nanovdb::Coord&) { return 1.0f; }; @@ -4244,7 +4341,7 @@ TEST_F(TestNanoVDB, GridBuilderBasicDense) EXPECT_EQ(1.0f, srcAcc.getValue(*ijk)); EXPECT_TRUE(srcAcc.isActive(*ijk)); } - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -4284,7 +4381,7 @@ TEST_F(TestNanoVDB, GridBuilderBasicDense) TEST_F(TestNanoVDB, GridBuilderBackground) { { - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.5f); auto acc = srcGrid.getAccessor(); @@ -4297,7 +4394,7 @@ TEST_F(TestNanoVDB, GridBuilderBackground) EXPECT_TRUE(acc.isActive(nanovdb::Coord(1))); EXPECT_EQ(0, acc.getValue(nanovdb::Coord(2))); EXPECT_TRUE(acc.isActive(nanovdb::Coord(1))); - auto gridHdl = nanovdb::createNanoGrid(srcGrid); + auto gridHdl = nanovdb::tools::createNanoGrid(srcGrid); auto grid = gridHdl.grid(); EXPECT_TRUE(grid); EXPECT_FALSE(grid->isEmpty()); @@ -4309,7 +4406,7 @@ TEST_F(TestNanoVDB, GridBuilderBackground) TEST_F(TestNanoVDB, GridBuilderSphere) { - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; Sphere sphere(nanovdb::Vec3d(50), 20.0f); EXPECT_EQ(3.0f, sphere.background()); EXPECT_EQ(3.0f, 
sphere(nanovdb::Coord(100))); @@ -4323,7 +4420,7 @@ TEST_F(TestNanoVDB, GridBuilderSphere) //mTimer.start("GridBuilder Sphere"); srcGrid(sphere, bbox); //mTimer.stop(); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* meta = handle.gridMetaData(); @@ -4381,7 +4478,7 @@ TEST_F(TestNanoVDB, createLevelSetSphere) EXPECT_EQ(-1.0f, sphere(nanovdb::Coord(center, center, center+radius-1))); EXPECT_EQ( 2.0f, sphere(nanovdb::Coord(center, center, center+radius+2))); //mTimer.start("createLevelSetSphere"); - auto handle = nanovdb::createLevelSetSphere(radius, nanovdb::Vec3d(center), + auto handle = nanovdb::tools::createLevelSetSphere(radius, nanovdb::Vec3d(center), voxelSize, width, nanovdb::Vec3d(0), gridName); //mTimer.stop(); const nanovdb::CoordBBox bbox(nanovdb::Coord(center-radius-width-1), @@ -4417,7 +4514,7 @@ TEST_F(TestNanoVDB, createLevelSetSphere) //std::cerr << "bbox.min = (" << dstGrid->indexBBox()[0][0] << ", " << dstGrid->indexBBox()[0][1] << ", " << dstGrid->indexBBox()[0][2] << ")" << std::endl; //std::cerr << "bbox.max = (" << dstGrid->indexBBox()[1][0] << ", " << dstGrid->indexBBox()[1][1] << ", " << dstGrid->indexBBox()[1][2] << ")" << std::endl; std::atomic count{0}; - nanovdb::forEach(bbox, [&](const nanovdb::CoordBBox &b){ + nanovdb::util::forEach(bbox, [&](const nanovdb::CoordBBox &b){ auto dstAcc = dstGrid->getAccessor(); for (auto it = b.begin(); it; ++it) { const nanovdb::Coord ijk = *it; @@ -4432,8 +4529,8 @@ TEST_F(TestNanoVDB, createLevelSetSphere) TEST_F(TestNanoVDB, createFogVolumeSphere) { - auto handle = nanovdb::createFogVolumeSphere(20.0f, nanovdb::Vec3d(50), - 1.0, 3.0, nanovdb::Vec3d(0), "sphere_20"); + auto handle = nanovdb::tools::createFogVolumeSphere(20.0f, nanovdb::Vec3d(50), + 1.0, 3.0, nanovdb::Vec3d(0), "sphere_20"); const nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(100)); EXPECT_TRUE(handle); @@ -4499,7 +4596,7 @@ TEST_F(TestNanoVDB, createPointSphere) EXPECT_EQ(-1.0f, sphere(nanovdb::Coord(0, 0, 99))); EXPECT_EQ(1.0f, sphere(nanovdb::Coord(0, 0, 101))); - auto handle = nanovdb::createPointSphere(1,// point per voxel + auto handle = nanovdb::tools::createPointSphere(1,// point per voxel 100.0,// radius of sphere nanovdb::Vec3d(0),// center sphere 1.0,// voxel size @@ -4535,7 +4632,7 @@ TEST_F(TestNanoVDB, createPointSphere) for (nanovdb::Coord ijk = bbox[0]; ijk[0] <= bbox[1][0]; ++ijk[0]) { for (ijk[1] = bbox[0][1]; ijk[1] <= bbox[1][1]; ++ijk[1]) { for (ijk[2] = bbox[0][2]; ijk[2] <= bbox[1][2]; ++ijk[2]) { - if (nanovdb::Abs(sphere(ijk)) < 0.5f) { + if (nanovdb::math::Abs(sphere(ijk)) < 0.5f) { ++count; EXPECT_TRUE(acc.isActive(ijk)); EXPECT_TRUE(acc.getValue(ijk) != std::numeric_limits::max()); @@ -4545,7 +4642,7 @@ TEST_F(TestNanoVDB, createPointSphere) EXPECT_LT(begin, end); EXPECT_EQ(1u, n); // exactly one point per voxel const nanovdb::Vec3f p = *begin;// + ijk.asVec3s();// local voxel coordinate + global index coordinates - EXPECT_TRUE(nanovdb::Abs(sphere(p)) <= 1.0f); + EXPECT_TRUE(nanovdb::math::Abs(sphere(p)) <= 1.0f); } else { EXPECT_FALSE(acc.isActive(ijk)); EXPECT_TRUE(acc.getValue(ijk) < 512 || acc.getValue(ijk) == std::numeric_limits::max()); @@ -4561,7 +4658,7 @@ TEST_F(TestNanoVDB, createLevelSetTorus) { - auto handle = nanovdb::createLevelSetTorus(100.0f, 50.0f, nanovdb::Vec3d(50), + auto handle = nanovdb::tools::createLevelSetTorus(100.0f, 50.0f,
nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "torus_100"); EXPECT_TRUE(handle); @@ -4598,7 +4695,7 @@ TEST_F(TestNanoVDB, createLevelSetTorus) TEST_F(TestNanoVDB, createFogVolumeTorus) { - auto handle = nanovdb::createFogVolumeTorus(100.0f, 50.0f, nanovdb::Vec3d(50), + auto handle = nanovdb::tools::createFogVolumeTorus(100.0f, 50.0f, nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "torus_100"); EXPECT_TRUE(handle); @@ -4639,7 +4736,7 @@ TEST_F(TestNanoVDB, createFogVolumeTorus) TEST_F(TestNanoVDB, createLevelSetBox) { - auto handle = nanovdb::createLevelSetBox(40.0f, 60.0f, 80.0f, nanovdb::Vec3d(50), + auto handle = nanovdb::tools::createLevelSetBox(40.0f, 60.0f, 80.0f, nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "box"); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -4675,7 +4772,7 @@ TEST_F(TestNanoVDB, createLevelSetBox) TEST_F(TestNanoVDB, createFogVolumeBox) { - auto handle = nanovdb::createFogVolumeBox(40.0f, 60.0f, 80.0f, nanovdb::Vec3d(50), + auto handle = nanovdb::tools::createFogVolumeBox(40.0f, 60.0f, 80.0f, nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "box"); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -4711,7 +4808,7 @@ TEST_F(TestNanoVDB, createFogVolumeBox) TEST_F(TestNanoVDB, createLevelSetOctahedron) { - auto handle = nanovdb::createLevelSetOctahedron(100.0f, nanovdb::Vec3d(50), + auto handle = nanovdb::tools::createLevelSetOctahedron(100.0f, nanovdb::Vec3d(50), 1.0f, 3.0f, nanovdb::Vec3d(0), "octahedron"); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -4740,7 +4837,7 @@ TEST_F(TestNanoVDB, createLevelSetOctahedron) EXPECT_TRUE(dstAcc.isActive(nanovdb::Coord(100, 50, 50))); EXPECT_EQ(1.0f, dstAcc.getValue(nanovdb::Coord(101, 50, 50))); EXPECT_TRUE(dstAcc.isActive(nanovdb::Coord(101, 50, 50))); - EXPECT_EQ(-nanovdb::Sqrt(4.0f/3.0f), dstAcc.getValue(nanovdb::Coord(98, 50, 50))); + EXPECT_EQ(-nanovdb::math::Sqrt(4.0f/3.0f), dstAcc.getValue(nanovdb::Coord(98, 50, 50))); EXPECT_TRUE(dstAcc.isActive(nanovdb::Coord(98, 50, 50))); } // createLevelSetOctahedron @@ -4774,7 +4871,7 @@ TEST_F(TestNanoVDB, CNanoVDBSize) #if !defined(DISABLE_PNANOVDB) && !defined(_MSC_VER) TEST_F(TestNanoVDB, PNanoVDB_Basic) { - EXPECT_EQ(NANOVDB_MAGIC_NUMBER, PNANOVDB_MAGIC_NUMBER); + EXPECT_EQ(NANOVDB_MAGIC_NUMB, PNANOVDB_MAGIC_NUMBER); EXPECT_EQ(NANOVDB_MAJOR_VERSION_NUMBER, PNANOVDB_MAJOR_VERSION_NUMBER); EXPECT_EQ(NANOVDB_MINOR_VERSION_NUMBER, PNANOVDB_MINOR_VERSION_NUMBER); @@ -5015,8 +5112,10 @@ TYPED_TEST(TestOffsets, PNanoVDB) grid_type = PNANOVDB_GRID_TYPE_VEC3U8; } else if (std::is_same::value) { grid_type = PNANOVDB_GRID_TYPE_VEC3U16; + } else if (std::is_same::value) { + grid_type = PNANOVDB_GRID_TYPE_UINT8; } else { - EXPECT_TRUE(!"you forgot to add a grid_type to TestOffsets::PNanoVDB!"); + EXPECT_FALSE("you forgot to add a grid_type to TestOffsets::PNanoVDB!"); } using nodeLeaf_t = typename nanovdb::LeafData; using leaf_t = typename nanovdb::LeafNode; @@ -5064,18 +5163,18 @@ TYPED_TEST(TestOffsets, PNanoVDB) // test GridBlindMetaData EXPECT_EQ((int)sizeof(nanovdb::GridBlindMetaData), PNANOVDB_GRIDBLINDMETADATA_SIZE); - EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mDataOffset), PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET); - EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mValueCount), PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT); - EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mValueSize), PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS); + EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mDataOffset),
PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET); + EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mValueCount), PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT); + EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mValueSize), PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE); EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mSemantic), PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC); EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mDataClass), PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS); EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mDataType), PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE); EXPECT_EQ(NANOVDB_OFFSETOF(nanovdb::GridBlindMetaData, mName), PNANOVDB_GRIDBLINDMETADATA_OFF_NAME); EXPECT_EQ((int)sizeof(pnanovdb_gridblindmetadata_t), PNANOVDB_GRIDBLINDMETADATA_SIZE); - EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, byte_offset), PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET); - EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, element_count), PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT); - EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, flags), PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS); + EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, data_offset), PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET); + EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, value_count), PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT); + EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, value_size), PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE); EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, semantic), PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC); EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, data_class), PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS); EXPECT_EQ(NANOVDB_OFFSETOF(pnanovdb_gridblindmetadata_t, data_type), PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE); @@ -5193,12 +5292,12 @@ TEST_F(TestNanoVDB, GridStats) { using GridT = nanovdb::NanoGrid; Sphere sphere(nanovdb::Vec3d(50), 50.0f); - nanovdb::build::Grid grid(sphere.background(), "test", nanovdb::GridClass::LevelSet); + nanovdb::tools::build::Grid grid(sphere.background(), "test", nanovdb::GridClass::LevelSet); const nanovdb::CoordBBox bbox(nanovdb::Coord(-100), nanovdb::Coord(100)); //mTimer.start("GridBuilder"); grid(sphere, bbox); //mTimer.stop(); - nanovdb::CreateNanoGrid> converter(grid); + nanovdb::tools::CreateNanoGrid> converter(grid); auto handle1 = converter.getHandle(); auto handle2 = converter.getHandle(); EXPECT_TRUE(handle1); @@ -5221,8 +5320,8 @@ TEST_F(TestNanoVDB, GridStats) { // reset stats in grid2 //grid2->tree().data()->mVoxelCount = uint64_t(0); - grid2->data()->mWorldBBox = nanovdb::BBox(); - grid2->tree().root().data()->mBBox = nanovdb::BBox(); + grid2->data()->mWorldBBox = nanovdb::math::BBox(); + grid2->tree().root().data()->mBBox = nanovdb::math::BBox(); for (uint32_t i = 0; i < grid2->tree().nodeCount(0); ++i) { auto& leaf = mgr2->leaf(i); auto* data = leaf.data(); @@ -5281,7 +5380,7 @@ TEST_F(TestNanoVDB, GridStats) } //mTimer.start("GridStats"); - nanovdb::gridStats(*grid2); + nanovdb::tools::updateGridStats(grid2); //mTimer.stop(); { // check stats in grid2 @@ -5328,12 +5427,12 @@ TEST_F(TestNanoVDB, ScalarSampleFromVoxels) auto trilinearIndex = [&](const nanovdb::Coord& ijk) -> float { return 0.34f + 1.6f * dx * ijk[0] + 6.7f * dx * ijk[1] - 3.5f * dx * ijk[2]; // index coordinates }; - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(1.0f); srcGrid.setTransform(dx); const nanovdb::CoordBBox 
bbox(nanovdb::Coord(0), nanovdb::Coord(128)); srcGrid(trilinearIndex, bbox); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* grid = handle.grid(); @@ -5346,10 +5445,10 @@ TEST_F(TestNanoVDB, ScalarSampleFromVoxels) //std::cerr << "Trilinear: exact = " << exact << ", approx = " << approx << std::endl; auto acc = grid->getAccessor(); - auto sampler0 = nanovdb::createSampler<0>(grid->tree()); - auto sampler1 = nanovdb::createSampler<1>(acc); - auto sampler2 = nanovdb::createSampler<2>(acc); - auto sampler3 = nanovdb::createSampler<3>(acc); + auto sampler0 = nanovdb::math::createSampler<0>(grid->tree()); + auto sampler1 = nanovdb::math::createSampler<1>(acc); + auto sampler2 = nanovdb::math::createSampler<2>(acc); + auto sampler3 = nanovdb::math::createSampler<3>(acc); //std::cerr << "0'th order: v = " << sampler0(xyz) << std::endl; EXPECT_EQ(approx, sampler0(xyz)); EXPECT_NE(exact, sampler0(xyz)); @@ -5386,12 +5485,12 @@ TEST_F(TestNanoVDB, VectorSampleFromVoxels) auto trilinearIndex = [&](const nanovdb::Coord& ijk) -> nanovdb::Vec3f { return nanovdb::Vec3f(0.34f, 1.6f * dx * ijk[0] + 6.7f * dx * ijk[1], -3.5f * dx * ijk[2]); // index coordinates }; - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(nanovdb::Vec3f(1.0f)); const nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(128)); srcGrid(trilinearIndex, bbox); srcGrid.setTransform(dx); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* grid = handle.grid(); @@ -5403,69 +5502,84 @@ TEST_F(TestNanoVDB, VectorSampleFromVoxels) //std::cerr << "Trilinear: exact = " << exact << ", approx = " << approx << std::endl; auto acc = grid->getAccessor(); - auto sampler0 = nanovdb::createSampler<0>(acc); + auto sampler0 = nanovdb::math::createSampler<0>(acc); //std::cerr << "0'th order: v = " << sampler0(ijk) << std::endl; EXPECT_EQ(approx, sampler0(ijk)); - auto sampler1 = nanovdb::createSampler<1>(acc); // faster since it's using an accessor!!! + auto sampler1 = nanovdb::math::createSampler<1>(acc); // faster since it's using an accessor!!! 
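// For reference, a minimal sketch of the relocated sampler factory exercised in these tests;
// the interpolation order is the template argument (0 = nearest neighbor, 1 = trilinear,
// 2 = triquadratic, 3 = tricubic), and passing an accessor rather than a tree enables the
// cached, faster lookups noted in the comment above:
//   auto acc       = grid->getAccessor();
//   auto nearest   = nanovdb::math::createSampler<0>(acc);
//   auto trilinear = nanovdb::math::createSampler<1>(acc);
//   auto value     = trilinear(nanovdb::Vec3d(1.2, 3.4, 5.6));// interpolated value at an index-space point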
//std::cerr << "1'th order: v = " << sampler1(ijk) << std::endl; for (int i = 0; i < 3; ++i) EXPECT_NEAR(exact[i], sampler1(ijk)[i], 1e-5); //EXPECT_FALSE(sampler1.zeroCrossing());// triggers a static_assert error //EXPECT_FALSE(sampler1.gradient(grid->indexToWorld(ijk)));// triggers a static_assert error - nanovdb::SampleFromVoxels, 3> sampler3(grid->tree()); - //auto sampler3 = nanovdb::createSampler<3>( acc ); + nanovdb::math::SampleFromVoxels, 3> sampler3(grid->tree()); + //auto sampler3 = nanovdb::math::createSampler<3>( acc ); //std::cerr << "3'rd order: v = " << sampler3(ijk) << std::endl; for (int i = 0; i < 3; ++i) EXPECT_NEAR(exact[i], sampler3(ijk)[i], 1e-5); } // VectorSampleFromVoxels -TEST_F(TestNanoVDB, GridChecksum) -{ - EXPECT_TRUE(nanovdb::ChecksumMode::Disable < nanovdb::ChecksumMode::End); - EXPECT_TRUE(nanovdb::ChecksumMode::Partial < nanovdb::ChecksumMode::End); - EXPECT_TRUE(nanovdb::ChecksumMode::Full < nanovdb::ChecksumMode::End); - EXPECT_TRUE(nanovdb::ChecksumMode::Default < nanovdb::ChecksumMode::End); - EXPECT_NE(nanovdb::ChecksumMode::Disable, nanovdb::ChecksumMode::Partial); - EXPECT_NE(nanovdb::ChecksumMode::Disable, nanovdb::ChecksumMode::Full); - EXPECT_NE(nanovdb::ChecksumMode::Full, nanovdb::ChecksumMode::Partial); - EXPECT_NE(nanovdb::ChecksumMode::Default, nanovdb::ChecksumMode::Disable); - EXPECT_EQ(nanovdb::ChecksumMode::Default, nanovdb::ChecksumMode::Partial); - EXPECT_NE(nanovdb::ChecksumMode::Default, nanovdb::ChecksumMode::Full); - - nanovdb::CpuTimer timer; - //timer.start("nanovdb::createLevelSetSphere"); - auto handle = nanovdb::createLevelSetSphere(100.0f, +TEST_F(TestNanoVDB, Checksum) +{ + EXPECT_LT(nanovdb::CheckMode::Disable, nanovdb::CheckMode::End); + EXPECT_LT(nanovdb::CheckMode::Partial, nanovdb::CheckMode::End); + EXPECT_LT(nanovdb::CheckMode::Full, nanovdb::CheckMode::End); + EXPECT_LT(nanovdb::CheckMode::Default, nanovdb::CheckMode::End); + EXPECT_NE(nanovdb::CheckMode::Disable, nanovdb::CheckMode::Partial); + EXPECT_NE(nanovdb::CheckMode::Disable, nanovdb::CheckMode::Full); + EXPECT_NE(nanovdb::CheckMode::Full, nanovdb::CheckMode::Partial); + EXPECT_NE(nanovdb::CheckMode::Default, nanovdb::CheckMode::Disable); + EXPECT_EQ(nanovdb::CheckMode::Default, nanovdb::CheckMode::Partial); + EXPECT_NE(nanovdb::CheckMode::Default, nanovdb::CheckMode::Full); + + nanovdb::Checksum checksum1, checksum2, checksum3; + EXPECT_EQ(sizeof(checksum1), sizeof(uint64_t)); + EXPECT_EQ(~uint64_t(0), checksum1.full()); + EXPECT_EQ(checksum1.mode(), nanovdb::CheckMode::Disable); + EXPECT_EQ(nanovdb::toCheckMode(checksum1), nanovdb::CheckMode::Disable); + checksum1.head() = 0u; + EXPECT_EQ(checksum1.mode(), nanovdb::CheckMode::Partial); + checksum1.tail() = 0u; + EXPECT_EQ(checksum1.mode(), nanovdb::CheckMode::Full); + EXPECT_EQ( uint64_t(0), checksum1.full()); + checksum1.disable(); + EXPECT_EQ(~uint64_t(0), checksum1.full()); + EXPECT_EQ(checksum1.mode(), nanovdb::CheckMode::Disable); + EXPECT_EQ(nanovdb::toCheckMode(checksum1), nanovdb::CheckMode::Disable); + EXPECT_EQ(checksum1, checksum3); + + nanovdb::util::Timer timer; + //timer.start("nanovdb::tools::createLevelSetSphere"); + auto handle = nanovdb::tools::createLevelSetSphere(100.0f, nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "sphere_20", - nanovdb::StatsMode::Disable, - nanovdb::ChecksumMode::Disable); + nanovdb::tools::StatsMode::Disable, + nanovdb::CheckMode::Disable); //timer.stop(); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* grid = handle.grid(); EXPECT_TRUE(grid); -
nanovdb::GridChecksum checksum1, checksum2, checksum3; - - EXPECT_EQ(checksum1, checksum3); - //timer.start("Partial checksum"); - checksum3(*grid, nanovdb::ChecksumMode::Partial); + checksum3 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Partial); + //checksum3(*grid, nanovdb::CheckMode::Partial); //timer.stop(); EXPECT_NE(checksum1, checksum3); //timer.start("Full checksum"); - checksum1(*grid, nanovdb::ChecksumMode::Full); + checksum1 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); + //checksum1(*grid, nanovdb::CheckMode::Full); //timer.stop(); - checksum2(*grid, nanovdb::ChecksumMode::Full); + //checksum2(*grid, nanovdb::CheckMode::Full); + checksum2 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); EXPECT_EQ(checksum1, checksum2); @@ -5474,72 +5588,84 @@ TEST_F(TestNanoVDB, GridChecksum) leaf->data()->mValues[0] += 0.00001f; // slightly modify a single voxel value - checksum2(*grid, nanovdb::ChecksumMode::Full); + checksum2 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); + //checksum2(*grid, nanovdb::CheckMode::Full); EXPECT_NE(checksum1, checksum2); leaf->data()->mValues[0] -= 0.00001f; // change back the single voxel value to its original value - checksum2(*grid, nanovdb::ChecksumMode::Full); + checksum2 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); + //checksum2(*grid, nanovdb::CheckMode::Full); EXPECT_EQ(checksum1, checksum2); leaf->data()->mValueMask.toggle(0); // change a single bit in a value mask - checksum2(*grid, nanovdb::ChecksumMode::Full); + checksum2 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); + //checksum2(*grid, nanovdb::CheckMode::Full); EXPECT_NE(checksum1, checksum2); //timer.start("Incomplete checksum"); - checksum2(*grid, nanovdb::ChecksumMode::Partial); + checksum2 = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Partial); + //checksum2(*grid, nanovdb::CheckMode::Partial); //timer.stop(); EXPECT_EQ(checksum2, checksum3); } // GridChecksum TEST_F(TestNanoVDB, GridValidator) { - nanovdb::CpuTimer timer; - //timer.start("nanovdb::createLevelSetSphere"); - auto handle = nanovdb::createLevelSetSphere(100.0f, + nanovdb::util::Timer timer; + //timer.start("nanovdb::tools::createLevelSetSphere"); + auto handle = nanovdb::tools::createLevelSetSphere(100.0f, nanovdb::Vec3d(50), 1.0, 3.0, nanovdb::Vec3d(0), "sphere_20", - nanovdb::StatsMode::All, - nanovdb::ChecksumMode::Full); + nanovdb::tools::StatsMode::All, + nanovdb::CheckMode::Full); //timer.stop(); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* grid = handle.grid(); EXPECT_TRUE(grid); + { + auto mode = nanovdb::toCheckMode(grid->mChecksum); + EXPECT_EQ(nanovdb::CheckMode::Full, mode); + EXPECT_EQ(nanovdb::CheckMode::Full, grid->mChecksum.mode()); + char str[30]; + EXPECT_TRUE(nanovdb::util::streq(nanovdb::toStr(str, mode), "full")); + } + //timer.start("isValid - not detailed"); - EXPECT_TRUE(nanovdb::isValid(*grid, false, true)); + EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Partial, true)); //timer.stop(); //timer.start("isValid - detailed"); - EXPECT_TRUE(nanovdb::isValid(*grid, true, true)); + EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, true)); //timer.stop(); //timer.start("Full checksum"); - auto fastChecksum = nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full); + auto fastChecksum = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full); //timer.stop(); -
EXPECT_EQ(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full)); //auto mgr = nanovdb::createLeafMg auto* leaf = grid->tree().getFirstLeaf(); leaf->data()->mValues[0] += 0.00001f; // slightly modify a single voxel value - EXPECT_NE(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full)); - EXPECT_FALSE(nanovdb::isValid(*grid, true, false)); + EXPECT_NE(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full)); + EXPECT_FALSE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, false)); leaf->data()->mValues[0] -= 0.00001f; // change back the single voxel value to its original value - EXPECT_EQ(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full)); - EXPECT_TRUE(nanovdb::isValid(*grid, true, true)); + EXPECT_EQ(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full)); + EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, true)); leaf->data()->mValueMask.toggle(0); // change a single bit in a value mask - EXPECT_NE(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full)); - EXPECT_FALSE(nanovdb::isValid(*grid, true, false)); + EXPECT_NE(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full)); + EXPECT_FALSE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, false)); } // GridValidator TEST_F(TestNanoVDB, RandomReadAccessor) @@ -5548,7 +5674,7 @@ TEST_F(TestNanoVDB, RandomReadAccessor) const int voxelCount = 512, min = -10000, max = 10000; std::srand(98765); auto op = [&](){return rand() % (max - min) + min;}; - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; for (int i=0; i<10; ++i) { SrcGridT srcGrid(background); auto acc = srcGrid.getAccessor(); @@ -5560,7 +5686,7 @@ TEST_F(TestNanoVDB, RandomReadAccessor) ijk[2] = op(); acc.setValue(ijk, 1.0f*j); } - auto gridHdl = nanovdb::createNanoGrid(srcGrid); + auto gridHdl = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(gridHdl); EXPECT_EQ(1u, gridHdl.gridCount()); auto grid = gridHdl.grid(); @@ -5610,7 +5736,7 @@ TEST_F(TestNanoVDB, RandomReadAccessor) TEST_F(TestNanoVDB, StandardDeviation) { using OpT = nanovdb::GetNodeInfo; - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.5f); { @@ -5620,11 +5746,11 @@ TEST_F(TestNanoVDB, StandardDeviation) acc.setValue(nanovdb::Coord(1), 3.0f); acc.setValue(nanovdb::Coord(2), 0.0f); } - auto gridHdl = nanovdb::createNanoGrid(srcGrid); + auto gridHdl = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(gridHdl); auto grid = gridHdl.grid(); EXPECT_TRUE(grid); - nanovdb::gridStats(*grid); + nanovdb::tools::updateGridStats(grid); auto acc = grid->tree().getAccessor(); { @@ -5682,13 +5808,13 @@ TEST_F(TestNanoVDB, BoxStencil) const float a = 0.54f, b[3]={0.12f, 0.78f,-0.34f}; const nanovdb::Coord min(-17, -10, -8), max(10, 21, 13); const nanovdb::CoordBBox bbox(min, max), bbox2(min, max.offsetBy(-1)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto func = [&](const nanovdb::Coord &ijk) { return a + b[0]*ijk[0] + b[1]*ijk[1] + b[2]*ijk[2]; }; srcGrid(func, bbox); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* grid = handle.grid(); @@ -5700,7 +5826,7 @@ TEST_F(TestNanoVDB, BoxStencil) auto func2 = [&](const nanovdb::Vec3f &xyz) { return a + b[0]*xyz[0] + b[1]*xyz[1] + b[2]*xyz[2]; };
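// A minimal sketch of the relocated stencil classes exercised below, assuming a NanoVDB
// FloatGrid named `grid`; the explicit <nanovdb::FloatGrid> template argument is an
// assumption here, since explicit template parameters were elided in these hunks:
//   nanovdb::math::BoxStencil<nanovdb::FloatGrid> box(*grid);
//   box.moveTo(nanovdb::Coord(0));// caches the 2x2x2 neighborhood around the coordinate
//   nanovdb::math::GradStencil<nanovdb::FloatGrid> grad(*grid);
//   grad.moveTo(nanovdb::Coord(0));
//   auto g = grad.gradient();  // central-difference gradient, cf. HelloWorld_IndexGrid below
//   bool hit = grad.intersects();// zero-crossing test, cf. StencilIntersection below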
- nanovdb::BoxStencil s(*grid); + nanovdb::math::BoxStencil s(*grid); for (auto it = bbox2.begin(); it; ++it) { const nanovdb::Coord p = *it; s.moveTo(p); @@ -5718,7 +5844,7 @@ TEST_F(TestNanoVDB, CurvatureStencil) {// test of level set to sphere at (6,8,10) with R=10 and dx=0.5 const float radius = 10.0f; const nanovdb::Vec3d center(6.0, 8.0, 10.0);//i.e. (12,16,20) in index space - auto handle = nanovdb::createLevelSetSphere(radius, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, 0.5, // dx 20.0); // half-width so dense inside @@ -5728,7 +5854,7 @@ TEST_F(TestNanoVDB, CurvatureStencil) auto* grid = handle.grid(); EXPECT_TRUE(grid); - nanovdb::CurvatureStencil cs(*grid); + nanovdb::math::CurvatureStencil cs(*grid); nanovdb::Coord xyz(20,16,20);//i.e. 8 voxel or 4 world units away from the center cs.moveTo(xyz); @@ -5783,14 +5909,14 @@ TEST_F(TestNanoVDB, CurvatureStencil) // sparse level set sphere nanovdb::Vec3d C(0.35f, 0.35f, 0.35f); double r = 0.15, voxelSize = 1.0/(dim-1); - auto handle = nanovdb::createLevelSetSphere(r, C, voxelSize); + auto handle = nanovdb::tools::createLevelSetSphere(r, C, voxelSize); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); auto* sphere = handle.grid(); EXPECT_TRUE(sphere); - nanovdb::CurvatureStencil cs(*sphere); - const auto ijk = nanovdb::RoundDown(sphere->worldToIndex(nanovdb::Vec3d(0.35, 0.35, 0.35 + 0.15))); + nanovdb::math::CurvatureStencil cs(*sphere); + const auto ijk = nanovdb::math::RoundDown(sphere->worldToIndex(nanovdb::Vec3d(0.35, 0.35, 0.35 + 0.15))); const nanovdb::Vec3d tmp(ijk[0],ijk[1],ijk[2]); const double radius = (sphere->indexToWorld(tmp)-nanovdb::Vec3d(0.35)).length(); //std::cerr << "\rRadius = " << radius << std::endl; @@ -5858,7 +5984,7 @@ TEST_F(TestNanoVDB, GradStencil) {// test of level set to sphere at (6,8,10) with R=10 and dx=0.5 const float radius = 10.0f;// 20 voxels const nanovdb::Vec3d center(6.0, 8.0, 10.0);//i.e. (12,16,20) in index space - auto handle = nanovdb::createLevelSetSphere(radius, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, 0.5, // dx 20.0);// width, so dense inside @@ -5869,7 +5995,7 @@ TEST_F(TestNanoVDB, GradStencil) EXPECT_TRUE(grid); EXPECT_EQ(0.5f, grid->voxelSize()[0]); - nanovdb::GradStencil cs(*grid); + nanovdb::math::GradStencil cs(*grid); nanovdb::Coord ijk(12, 16, 20);// on the surface in the +x direction const nanovdb::Vec3d xyz(ijk[0], ijk[1], ijk[2]); @@ -5906,7 +6032,7 @@ TEST_F(TestNanoVDB, WenoStencil) {// test of level set to sphere at (6,8,10) with R=10 and dx=0.5 const float radius = 10.0f;// 20 voxels const nanovdb::Vec3d center(6.0, 8.0, 10.0);//i.e. 
(12,16,20) in index space - auto handle = nanovdb::createLevelSetSphere(radius, + auto handle = nanovdb::tools::createLevelSetSphere(radius, center, 0.5, // dx 20.0);// width, so dense inside @@ -5917,7 +6043,7 @@ TEST_F(TestNanoVDB, WenoStencil) EXPECT_TRUE(grid); EXPECT_EQ(0.5f, grid->voxelSize()[0]); - nanovdb::WenoStencil cs(*grid); + nanovdb::math::WenoStencil cs(*grid); nanovdb::Coord ijk(12, 16, 20);// on the surface in the +x direction const nanovdb::Vec3d xyz(ijk[0], ijk[1], ijk[2]); @@ -5951,7 +6077,7 @@ TEST_F(TestNanoVDB, WenoStencil) TEST_F(TestNanoVDB, StencilIntersection) { - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; const nanovdb::Coord ijk(1,4,-9); SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); @@ -5971,12 +6097,12 @@ TEST_F(TestNanoVDB, StencilIntersection) for (int pz=0; pz<2; ++pz) { acc.setValue(ijk.offsetBy(0,0,1), pz ? 1.0f : -1.0f); ++cases; - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto grid = handle.grid(); EXPECT_TRUE(grid); EXPECT_EQ(7, int(grid->activeVoxelCount())); - nanovdb::GradStencil stencil(*grid); + nanovdb::math::GradStencil stencil(*grid); stencil.moveTo(ijk); const int count = mx + px + my + py + mz + pz;// number of intersections EXPECT_TRUE(stencil.intersects() == (count > 0)); @@ -6009,39 +6135,45 @@ TEST_F(TestNanoVDB, MultiFile) } std::vector> handles; { // add an int32_t grid - nanovdb::build::Grid grid(-1, "Int32 grid"); + nanovdb::tools::build::Grid grid(-1, "Int32 grid"); auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(-256), 10); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // add an empty int32_t grid - nanovdb::build::Grid grid(-4, "Int32 grid, empty"); - handles.push_back(nanovdb::createNanoGrid(grid)); + nanovdb::tools::build::Grid grid(-4, "Int32 grid, empty"); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // add a Vec3f grid - nanovdb::build::Grid grid(nanovdb::Vec3f(0.0f, 0.0f, -1.0f),"Float vector grid",nanovdb::GridClass::Staggered); + nanovdb::tools::build::Grid grid(nanovdb::Vec3f(0.0f, 0.0f, -1.0f),"Float vector grid",nanovdb::GridClass::Staggered); auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(-256), nanovdb::Vec3f(1.0f, 0.0f, 0.0f)); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // add an int64_t grid - nanovdb::build::Grid grid(0, "Int64 grid"); + nanovdb::tools::build::Grid grid(0, "Int64 grid"); auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(0), 10); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); + } + { // add a uint8_t grid + nanovdb::tools::build::Grid grid(0, "UInt8 grid"); + auto acc = grid.getAccessor(); + acc.setValue(nanovdb::Coord(0), 8u); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } for (int i = 0; i < 10; ++i) { const float radius = 100.0f; const float voxelSize = 1.0f, width = 3.0f; const nanovdb::Vec3d center(i * 10.0f, 0.0f, 0.0f); - handles.push_back(nanovdb::createLevelSetSphere(radius, center, voxelSize, width, + handles.push_back(nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, width, nanovdb::Vec3d(0), "Level set sphere at (" + std::to_string(i * 10) + ",0,0)")); } { // add a double grid -
nanovdb::tools::build::Grid grid(0.0, "Double grid", nanovdb::GridClass::FogVolume); auto acc = grid.getAccessor(); acc.setValue(nanovdb::Coord(6000), 1.0); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } #if defined(NANOVDB_USE_BLOSC) nanovdb::io::writeGrids("data/multi1.nvdb", handles, nanovdb::io::Codec::BLOSC); @@ -6054,14 +6186,14 @@ TEST_F(TestNanoVDB, MultiFile) //mTimer.start("nanovdb::io::readGridMetaData"); auto meta = nanovdb::io::readGridMetaData("data/multi1.nvdb"); //mTimer.stop(); - EXPECT_EQ(15u, meta.size()); + EXPECT_EQ(16u, meta.size()); EXPECT_EQ(std::string("Double grid"), meta.back().gridName); } { // read int32 grid and test values //mTimer.start("Reading multiple grids from file"); auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); //mTimer.stop(); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); auto& handle = handles.front(); EXPECT_EQ(1u, handle.gridCount()); EXPECT_EQ(std::string("Int32 grid"), handle.gridMetaData()->shortGridName()); @@ -6107,7 +6239,7 @@ TEST_F(TestNanoVDB, MultiFile) //mTimer.start("Reading multiple grids from file"); auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); //mTimer.stop(); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); auto& handle = handles[1]; EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -6145,7 +6277,7 @@ TEST_F(TestNanoVDB, MultiFile) //mTimer.start("Reading multiple grids from file"); auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); //mTimer.stop(); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); auto& handle = handles[3]; EXPECT_EQ(1u, handle.gridCount()); EXPECT_TRUE(handle); @@ -6165,9 +6297,35 @@ TEST_F(TestNanoVDB, MultiFile) EXPECT_TRUE(grid->isUnknown()); EXPECT_FALSE(grid->isStaggered()); } + /* + { // read uint8 grid and test values + //mTimer.start("Reading multiple grids from file"); + auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); + //mTimer.stop(); + EXPECT_EQ(16u, handles.size()); + auto& handle = handles[4]; + EXPECT_EQ(1u, handle.gridCount()); + EXPECT_TRUE(handle); + EXPECT_EQ(std::string("UInt8 grid"), handle.gridMetaData()->shortGridName()); + auto* grid = handle.grid(); + EXPECT_TRUE(grid); + EXPECT_EQ(handle.gridMetaData()->indexBBox(), grid->indexBBox()); + EXPECT_EQ(1u, grid->activeVoxelCount()); + const nanovdb::Coord ijk(0); + EXPECT_EQ(8u, grid->tree().getValue(ijk)); + EXPECT_EQ(0, grid->tree().getValue(ijk + nanovdb::Coord(1, 0, 0))); + EXPECT_EQ(8u, grid->tree().root().minimum()); + EXPECT_EQ(8u, grid->tree().root().maximum()); + EXPECT_EQ(nanovdb::CoordBBox(ijk, ijk), grid->indexBBox()); + EXPECT_FALSE(grid->isLevelSet()); + EXPECT_FALSE(grid->isFogVolume()); + EXPECT_TRUE(grid->isUnknown()); + EXPECT_FALSE(grid->isStaggered()); + } + */ { // read vec3f grid and test values auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); auto& handle = handles[2]; EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -6189,7 +6347,7 @@ TEST_F(TestNanoVDB, MultiFile) } { // read double grid and test values auto handles = nanovdb::io::readGrids("data/multi1.nvdb"); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); auto& handle = handles.back(); EXPECT_TRUE(handle); EXPECT_EQ(1u, handle.gridCount()); @@ -6217,8 +6375,8 @@ TEST_F(TestNanoVDB, HostBuffer) std::vector > gridHdls; // create two grids...
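// For reference, a minimal sketch of the multi-grid file I/O exercised in MultiFile above
// (Codec::BLOSC requires building with NANOVDB_USE_BLOSC; Codec::NONE needs no compression
// library, and is an assumption here since the #else branch of the guard is elided):
//   nanovdb::io::writeGrids("data/multi1.nvdb", handles, nanovdb::io::Codec::NONE);
//   auto meta = nanovdb::io::readGridMetaData("data/multi1.nvdb");// metadata only, no grid payload
//   auto read = nanovdb::io::readGrids("data/multi1.nvdb");       // one GridHandle per stored grid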
- gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref")); - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered")); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref")); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered")); EXPECT_TRUE(gridHdls[0]); auto* meta0 = gridHdls[0].gridMetaData(); @@ -6256,8 +6414,8 @@ TEST_F(TestNanoVDB, HostBuffer) std::vector > gridHdls; // create two grids... - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); - gridHdls.push_back(nanovdb::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); EXPECT_TRUE(gridHdls[0]); auto* meta0 = gridHdls[0].gridMetaData(); @@ -6337,8 +6495,8 @@ TEST_F(TestNanoVDB, HostBuffer) std::vector > gridHdls; // create two grids... - ASSERT_THROW(gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)), std::runtime_error); - ASSERT_THROW(gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)), std::runtime_error); + ASSERT_THROW(gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)), std::runtime_error); + ASSERT_THROW(gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)), std::runtime_error); } {// zero internal memory size ASSERT_THROW(nanovdb::HostBuffer::createPool(0), std::runtime_error); @@ -6359,8 +6517,8 @@ TEST_F(TestNanoVDB, HostBuffer) std::vector > gridHdls; // create two grids... 
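// A minimal sketch of the pool-backed allocation pattern tested here; handles created
// against the pool share its memory, and pool.reset() (used below) invalidates them.
// The argument list follows the calls in this hunk; BuildT defaults to float:
//   auto pool = nanovdb::HostBuffer::createPool(1 << 26);// 64 MB memory pool
//   auto hdl  = nanovdb::tools::createLevelSetSphere(100.0, nanovdb::Vec3d(0),
//       1.0, 3.0, nanovdb::Vec3d(0), "sphere",
//       nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool);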
- gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); - gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0f, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); EXPECT_TRUE(gridHdls[0]); auto* meta0 = gridHdls[0].gridMetaData(); @@ -6415,8 +6573,8 @@ TEST_F(TestNanoVDB, HostBuffer) std::vector > gridHdls; // create two grids... - ASSERT_THROW(gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)), std::runtime_error); - ASSERT_THROW(gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)), std::runtime_error); + ASSERT_THROW(gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)), std::runtime_error); + ASSERT_THROW(gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)), std::runtime_error); EXPECT_FALSE(pool.isManaged()); pool.resizePool(1<<26);// resize to 64 MB @@ -6437,8 +6595,8 @@ TEST_F(TestNanoVDB, HostBuffer) EXPECT_FALSE(buffer.isFull()); EXPECT_TRUE(buffer.isManaged()); - gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); - gridHdls.push_back(nanovdb::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d(-20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "spheref", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); + gridHdls.push_back(nanovdb::tools::createLevelSetSphere( 100.0, nanovdb::Vec3d( 20, 0, 0), 1.0, 3.0, nanovdb::Vec3d(0), "sphered", nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Partial, pool)); EXPECT_TRUE(gridHdls[0]); auto* meta0 = gridHdls[0].gridMetaData(); @@ -6483,23 +6641,25 @@ TEST_F(TestNanoVDB, HostBuffer) auto pool = nanovdb::HostBuffer::createPool(poolSize, nanovdb::alignPtr(array.get())); EXPECT_EQ(128ULL * 1024 * 1024, pool.poolSize()); auto handles = nanovdb::io::readGrids("data/multi1.nvdb", 0, pool); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); for (auto &h : handles) EXPECT_TRUE(h); EXPECT_EQ(std::string("Int32 grid"), handles[0].grid()->gridName()); EXPECT_EQ(std::string("Int32 grid, empty"), handles[1].grid()->gridName()); EXPECT_EQ(std::string("Float vector grid"), handles[2].grid()->gridName()); 
EXPECT_EQ(std::string("Int64 grid"), handles[3].grid()->gridName()); - EXPECT_EQ(std::string("Double grid"), handles[14].grid()->gridName()); + EXPECT_EQ(std::string("UInt8 grid"), handles[4].grid()->gridName()); + EXPECT_EQ(std::string("Double grid"), handles[15].grid()->gridName()); pool.reset(); for (auto &h : handles) EXPECT_FALSE(h); handles = nanovdb::io::readGrids("data/multi1.nvdb", 0, pool); - EXPECT_EQ(15u, handles.size()); + EXPECT_EQ(16u, handles.size()); for (auto &h : handles) EXPECT_TRUE(h); EXPECT_EQ(std::string("Int32 grid"), handles[0].grid()->gridName()); EXPECT_EQ(std::string("Int32 grid, empty"), handles[1].grid()->gridName()); EXPECT_EQ(std::string("Float vector grid"), handles[2].grid()->gridName()); EXPECT_EQ(std::string("Int64 grid"), handles[3].grid()->gridName()); - EXPECT_EQ(std::string("Double grid"), handles[14].grid()->gridName()); + EXPECT_EQ(std::string("UInt8 grid"), handles[4].grid()->gridName()); + EXPECT_EQ(std::string("Double grid"), handles[15].grid()->gridName()); } catch(const std::exception& e) { std::cout << "Unable to read \"data/multi1.nvdb\" for unit-test\n" << e.what() << std::endl; } @@ -6513,7 +6673,7 @@ TEST_F(TestNanoVDB, NodeIterators) const float halfWidth = 3.0f; const nanovdb::Vec3d center(0); //mTimer.start("Create level set sphere"); - auto handle1 = nanovdb::createLevelSetSphere(radius, center, voxelSize, halfWidth); + auto handle1 = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth); //mTimer.stop(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -6626,13 +6786,13 @@ TEST_F(TestNanoVDB, BasicValueIndexStats) EXPECT_EQ(64u, size4 - size3);// 512 bits = 64 bytes } EXPECT_TRUE(nanovdb::Version() >= nanovdb::Version(32,3,4)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, 1.0f); - auto handle1 = nanovdb::createNanoGrid(srcGrid); + auto handle1 = nanovdb::tools::createNanoGrid(srcGrid); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -6648,7 +6808,7 @@ TEST_F(TestNanoVDB, BasicValueIndexStats) EXPECT_EQ(1.0f, fltGrid->tree().getValue(ijk)); EXPECT_EQ(0.0f, fltGrid->tree().getValue(nanovdb::Coord(0,0,0))); - auto handle2 = nanovdb::createNanoGrid(*fltGrid, 1u, true, true); + auto handle2 = nanovdb::tools::createNanoGrid(*fltGrid, 1u, true, true); auto *idxGrid = handle2.grid(); EXPECT_TRUE(idxGrid); EXPECT_EQ(1u, idxGrid->blindDataCount()); @@ -6714,13 +6874,13 @@ TEST_F(TestNanoVDB, BasicValueIndexStats) TEST_F(TestNanoVDB, BasicValueIndexStats2) { EXPECT_TRUE(nanovdb::Version() >= nanovdb::Version(32,3,4)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, 1.0f); - auto handle2 = nanovdb::createNanoGrid(srcGrid, 1u, true, true); + auto handle2 = nanovdb::tools::createNanoGrid(srcGrid, 1u, true, true); auto *idxGrid = handle2.grid(); EXPECT_TRUE(idxGrid); @@ -6784,12 +6944,12 @@ TEST_F(TestNanoVDB, BasicValueIndexStats2) TEST_F(TestNanoVDB, ValueMask2ValueIndex) { - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(true); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, true); - auto handle = nanovdb::createNanoGrid(srcGrid, 0u, false, false);// no stats or tiles + auto handle = 
nanovdb::tools::createNanoGrid(srcGrid, 0u, false, false);// no stats or tiles auto *idxGrid = handle.grid(); EXPECT_TRUE(idxGrid); EXPECT_EQ(1u, idxGrid->activeVoxelCount()); @@ -6798,12 +6958,12 @@ TEST_F(TestNanoVDB, ValueMask2ValueIndex) TEST_F(TestNanoVDB, ValueMask2ValueOnIndex) { - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(true); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, true); - auto handle = nanovdb::createNanoGrid(srcGrid, 0u, true, false);// stats but no tiles + auto handle = nanovdb::tools::createNanoGrid(srcGrid, 0u, true, false);// stats but no tiles auto *idxGrid = handle.grid(); EXPECT_TRUE(idxGrid); EXPECT_EQ(1u, idxGrid->activeVoxelCount()); @@ -6831,12 +6991,12 @@ TEST_F(TestNanoVDB, ValueMask2ValueOnIndex) TEST_F(TestNanoVDB, BasicValueIndexNoStats) { EXPECT_TRUE(nanovdb::Version() >= nanovdb::Version(32,3,4)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, 1.0f); - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); auto handle1 = converter.getHandle(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -6911,12 +7071,12 @@ TEST_F(TestNanoVDB, BasicValueIndexNoStats) TEST_F(TestNanoVDB, BasicValueIndexNoStatsNoTiles) { EXPECT_TRUE(nanovdb::Version() >= nanovdb::Version(32,3,4)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, 1.0f); - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); auto handle1 = converter.getHandle(); auto *fltGrid = handle1.grid(); @@ -6997,12 +7157,12 @@ TEST_F(TestNanoVDB, BasicValueIndexNoStatsNoTiles) TEST_F(TestNanoVDB, SparseIndexGridBuilder1) { EXPECT_TRUE(nanovdb::Version() >= nanovdb::Version(32,3,4)); - using SrcGridT = nanovdb::build::Grid; + using SrcGridT = nanovdb::tools::build::Grid; SrcGridT srcGrid(0.0f); auto acc = srcGrid.getAccessor(); const nanovdb::Coord ijk(0,0,1); acc.setValue(ijk, 1.0f); - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); auto handle1 = converter.getHandle(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -7083,7 +7243,7 @@ TEST_F(TestNanoVDB, IndexGridBuilder2) const float halfWidth = 3.0f; const nanovdb::Vec3d center(0); //mTimer.start("Create level set sphere"); - auto handle1 = nanovdb::createLevelSetSphere(radius, center, voxelSize, halfWidth); + auto handle1 = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth); //mTimer.stop(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -7092,7 +7252,7 @@ TEST_F(TestNanoVDB, IndexGridBuilder2) //std::cerr << "FloatGrid footprint: " << (fltGrid->gridSize()>>20) << "MB" << std::endl; // create an IndexGrid for the FloatGrid - nanovdb::CreateNanoGrid builder2(*fltGrid); + nanovdb::tools::CreateNanoGrid builder2(*fltGrid); //mTimer.start("Create IndexGrid"); auto handle2 = builder2.getHandle(1u); //mTimer.stop(); @@ -7185,7 +7345,7 @@ TEST_F(TestNanoVDB, IndexGridBuilder2) } //mTimer.restart("Parallel bbox test of value buffer"); // here is a multi-threaded version - nanovdb::forEach(idxGrid->indexBBox(),[&](const nanovdb::CoordBBox &bbox){ + 
nanovdb::util::forEach(idxGrid->indexBBox(),[&](const nanovdb::CoordBBox &bbox){ auto idxAcc = idxTree.getAccessor();// NOT thread-safe! auto fltAcc = fltTree.getAccessor();// NOT thread-safe! for (auto it = bbox.begin(); it; ++it) EXPECT_EQ(values[idxAcc.getValue(*it)], fltAcc.getValue(*it)); @@ -7211,7 +7371,7 @@ TEST_F(TestNanoVDB, IndexGridBuilder2) }// loop over leaf nodes //mTimer.restart("Parallel leaf iterator test of active voxels"); auto *idxLeaf0 = idxTree.getFirstNode<0>(); - nanovdb::forEach(nanovdb::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(nanovdb::util::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::util::Range1D &r){ auto fltAcc = fltTree.getAccessor();// NOT thread-safe! for (auto i=r.begin(); i!=r.end(); ++i){ auto *idxLeaf = idxLeaf0 + i; @@ -7226,7 +7386,7 @@ TEST_F(TestNanoVDB, IndexGridBuilder2) //mTimer.stop(); //mTimer.start("Dense IndexGrid: Parallel leaf iterator test of active voxels"); auto *leaf = idxTree.getFirstNode<0>(); - nanovdb::forEach(nanovdb::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(nanovdb::util::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::util::Range1D &r){ auto fltAcc = fltTree.getAccessor();// NOT thread-safe! for (auto i=r.begin(); i!=r.end(); ++i){ for (auto vox = leaf[i].beginValueOn(); vox; ++vox) { @@ -7248,7 +7408,7 @@ TEST_F(TestNanoVDB, SparseIndexGridBuilder2) const float halfWidth = 3.0f; const nanovdb::Vec3d center(0); //mTimer.start("Create level set sphere"); - auto handle1 = nanovdb::createLevelSetSphere(radius, center, voxelSize, halfWidth); + auto handle1 = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth); //mTimer.stop(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -7257,7 +7417,7 @@ TEST_F(TestNanoVDB, SparseIndexGridBuilder2) //std::cerr << "FloatGrid footprint: " << (fltGrid->gridSize()>>20) << "MB" << std::endl; // create an IndexGrid for the FloatGrid - nanovdb::CreateNanoGrid builder2(*fltGrid); + nanovdb::tools::CreateNanoGrid builder2(*fltGrid); //mTimer.start("Create IndexGrid"); auto handle2 = builder2.getHandle(1u, false, true); //mTimer.stop(); @@ -7331,7 +7491,7 @@ TEST_F(TestNanoVDB, SparseIndexGridBuilder2) } //mTimer.restart("Parallel bbox test of value buffer"); // here is a multi-threaded version - nanovdb::forEach(idxGrid->indexBBox(),[&](const nanovdb::CoordBBox &bbox){ + nanovdb::util::forEach(idxGrid->indexBBox(),[&](const nanovdb::CoordBBox &bbox){ auto idxAcc = idxTree.getAccessor();// NOT thread-safe! auto fltAcc = fltTree.getAccessor();// NOT thread-safe! uint64_t n; @@ -7364,7 +7524,7 @@ TEST_F(TestNanoVDB, SparseIndexGridBuilder2) }// loop over leaf nodes //mTimer.start("Sparse IndexGrid: Parallel leaf iterator test of active voxels"); auto *leaf = idxTree.getFirstNode<0>(); - nanovdb::forEach(nanovdb::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(nanovdb::util::Range1D(0,idxTree.nodeCount(0)),[&](const nanovdb::util::Range1D &r){ auto fltAcc = fltTree.getAccessor();// NOT thread-safe! 
for (auto i=r.begin(); i!=r.end(); ++i){ for (auto vox = leaf[i].beginValueOn(); vox; ++vox) { @@ -7386,7 +7546,7 @@ TEST_F(TestNanoVDB, ChannelIndexGridBuilder) const float halfWidth = 3.0f; const nanovdb::Vec3d center(0); //mTimer.start("Create level set sphere"); - auto handle1 = nanovdb::createLevelSetSphere(radius, center, voxelSize, halfWidth); + auto handle1 = nanovdb::tools::createLevelSetSphere(radius, center, voxelSize, halfWidth); //mTimer.stop(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); @@ -7395,7 +7555,7 @@ TEST_F(TestNanoVDB, ChannelIndexGridBuilder) //std::cerr << "FloatGrid footprint: " << (fltGrid->gridSize()>>20) << "MB" << std::endl; // create an IndexGrid for the FloatGrid - nanovdb::CreateNanoGrid builder2(*fltGrid); + nanovdb::tools::CreateNanoGrid builder2(*fltGrid); //mTimer.start("Create IndexGrid"); auto handle2 = builder2.getHandle(channels, false); //mTimer.stop(); @@ -7430,7 +7590,7 @@ TEST_F(TestNanoVDB, ChannelIndexGridBuilder) //mTimer.start("Parallel leaf iterator test of active voxels in channel"); const float *values = idxGrid->getBlindData(i); EXPECT_TRUE(values); - nanovdb::forEach(0,idxTree.nodeCount(0),8,[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(0,idxTree.nodeCount(0),8,[&](const nanovdb::util::Range1D &r){ auto fltAcc = fltTree.getAccessor();// NOT thread-safe! for (auto i=r.begin(); i!=r.end(); ++i){ for (auto vox = leaf[i].beginValueOn(); vox; ++vox) { @@ -7451,7 +7611,7 @@ TEST_F(TestNanoVDB, ChannelIndexGridBuilder) //mTimer.start("Parallel leaf iterator test of active voxels in channel"); const float *values = idxGrid->getBlindData(i); EXPECT_TRUE(values); - nanovdb::forEach(0,idxTree.nodeCount(0),8,[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(0,idxTree.nodeCount(0),8,[&](const nanovdb::util::Range1D &r){ nanovdb::ChannelAccessor acc(*idxGrid, i);// NOT thread-safe EXPECT_TRUE(acc); auto fltAcc = fltTree.getAccessor();// NOT thread-safe! 
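// For reference, a minimal sketch of the relocated parallel-loop helper used throughout
// these hunks; the third argument is the grain size (8 here is just an example), and
// creating one accessor per sub-range sidesteps the thread-safety caveats noted above:
//   nanovdb::util::forEach(0, idxTree.nodeCount(0), 8, [&](const nanovdb::util::Range1D &r){
//       auto acc = fltTree.getAccessor();// accessors are not thread-safe, so make one per range
//       for (auto i = r.begin(); i != r.end(); ++i) { /* process leaf node i */ }
//   });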
@@ -7474,14 +7634,14 @@ TEST_F(TestNanoVDB, ChannelIndexGridBuilder) TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Dense) { const nanovdb::Coord ijk(101,0,0); - auto handle1 = nanovdb::createLevelSetSphere(); + auto handle1 = nanovdb::tools::createLevelSetSphere(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); //std::cerr << "Grid size: " << (fltGrid->gridSize() >> 20) << " MB\n"; EXPECT_EQ(1.0f, fltGrid->tree().getValue(ijk)); {// create an IndexGrid with an internal channel and write it to file - nanovdb::io::writeGrid("data/index_grid.nvdb", nanovdb::createNanoGrid(*fltGrid,1u, true, true));// 1 channel, include stats and tile values + nanovdb::io::writeGrid("data/index_grid.nvdb", nanovdb::tools::createNanoGrid(*fltGrid,1u, true, true));// 1 channel, include stats and tile values } {// read and test IndexGrid auto tmp = nanovdb::io::readGrid("data/index_grid.nvdb"); @@ -7494,7 +7654,7 @@ TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Dense) EXPECT_EQ(1.0f, acc(ijk)); // compute the gradient from channel ID 0 - nanovdb::GradStencil> stencil(acc); + nanovdb::math::GradStencil> stencil(acc); stencil.moveTo(ijk); EXPECT_EQ(nanovdb::Vec3f(1.0f,0.0f,0.0f), stencil.gradient()); @@ -7510,14 +7670,14 @@ TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Dense) TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Sparse) { const nanovdb::Coord ijk(101,0,0); - auto handle1 = nanovdb::createLevelSetSphere(); + auto handle1 = nanovdb::tools::createLevelSetSphere(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); //std::cerr << "Grid size: " << (fltGrid->gridSize() >> 20) << " MB\n"; EXPECT_EQ(1.0f, fltGrid->tree().getValue(ijk)); {// create an IndexGrid with an internal channel and write it to file - nanovdb::io::writeGrid("data/index_grid.nvdb", nanovdb::createNanoGrid(*fltGrid, 1u, false, true));// 1 channel, no stats and include tile values + nanovdb::io::writeGrid("data/index_grid.nvdb", nanovdb::tools::createNanoGrid(*fltGrid, 1u, false, true));// 1 channel, no stats and include tile values } {// read and test IndexGrid auto tmp = nanovdb::io::readGrid("data/index_grid.nvdb"); @@ -7530,7 +7690,7 @@ TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Sparse) EXPECT_EQ(1.0f, acc(ijk)); // compute the gradient from channel ID 0 - nanovdb::GradStencil> stencil(acc); + nanovdb::math::GradStencil> stencil(acc); stencil.moveTo(ijk); EXPECT_EQ(nanovdb::Vec3f(1.0f,0.0f,0.0f), stencil.gradient()); @@ -7546,14 +7706,14 @@ TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Sparse) TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Sparse2) { const nanovdb::Coord ijk(101,0,0); - auto handle1 = nanovdb::createLevelSetSphere(); + auto handle1 = nanovdb::tools::createLevelSetSphere(); auto *fltGrid = handle1.grid(); EXPECT_TRUE(fltGrid); //std::cerr << "Grid size: " << (fltGrid->gridSize() >> 20) << " MB\n"; EXPECT_EQ(1.0f, fltGrid->tree().getValue(ijk)); {// create an IndexGrid with an internal channel and write it to file - nanovdb::io::writeGrid("data/index_grid2.nvdb", nanovdb::createNanoGrid(*fltGrid, 1u, false, false));// 1 channel, no stats and no tile values + nanovdb::io::writeGrid("data/index_grid2.nvdb", nanovdb::tools::createNanoGrid(*fltGrid, 1u, false, false));// 1 channel, no stats and no tile values } {// read and test IndexGrid auto tmp = nanovdb::io::readGrid("data/index_grid2.nvdb"); @@ -7566,7 +7726,7 @@ TEST_F(TestNanoVDB, HelloWorld_IndexGrid_Sparse2) EXPECT_EQ(1.0f, acc(ijk)); // compute the gradient from channel ID 0 - nanovdb::GradStencil> stencil(acc); + nanovdb::math::GradStencil> stencil(acc); stencil.moveTo(ijk); 
         stencil.moveTo(ijk);
         EXPECT_EQ(nanovdb::Vec3f(1.0f,0.0f,0.0f), stencil.gradient());

@@ -7584,7 +7744,7 @@ TEST_F(TestNanoVDB, writeReadUncompressedGrid)
     using GridHandleT = nanovdb::GridHandle<nanovdb::HostBuffer>;
     const nanovdb::Coord ijk(101,0,0);
     std::vector<GridHandleT> handles1;
-    handles1.emplace_back(nanovdb::createLevelSetSphere());
+    handles1.emplace_back(nanovdb::tools::createLevelSetSphere());
     EXPECT_EQ(1u, handles1.size());
     auto *fltGrid1 = handles1[0].grid<float>();
     EXPECT_TRUE(fltGrid1);
@@ -7605,7 +7765,7 @@ TEST_F(TestNanoVDB, writeReadUncompressedGridRaw)
     using GridHandleT = nanovdb::GridHandle<nanovdb::HostBuffer>;
     const nanovdb::Coord ijk(101,0,0);
     std::vector<GridHandleT> handles1;
-    handles1.emplace_back(nanovdb::createLevelSetSphere());
+    handles1.emplace_back(nanovdb::tools::createLevelSetSphere());
     EXPECT_EQ(1u, handles1.size());
     auto *fltGrid1 = handles1[0].grid<float>();
     EXPECT_TRUE(fltGrid1);
@@ -7623,7 +7783,7 @@ TEST_F(TestNanoVDB, writeReadUncompressedGridRaw)
 TEST_F(TestNanoVDB, GridMetaData)
 {
-    auto handle = nanovdb::createLevelSetSphere();
+    auto handle = nanovdb::tools::createLevelSetSphere();
     auto *grid = handle.grid<float>();
     EXPECT_TRUE(grid);
     EXPECT_TRUE(grid->isRootConnected());
@@ -7638,7 +7798,7 @@ TEST_F(TestNanoVDB, GridMetaData)
 TEST_F(TestNanoVDB, BuildTree)
 {
     nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(511));
-    nanovdb::build::Grid grid1(false), grid2(false);
+    nanovdb::tools::build::Grid grid1(false), grid2(false);
     {
         //mTimer.start("Serial build::Tree");
         auto kernel = [&](const nanovdb::CoordBBox& bbox) {
@@ -7654,7 +7814,7 @@ TEST_F(TestNanoVDB, BuildTree)
             auto acc = grid2.getWriteAccessor();
             for (auto it = bbox.begin(); it; ++it) acc.setValueOn(*it);
         };
-        nanovdb::forEach(bbox, kernel);
+        nanovdb::util::forEach(bbox, kernel);
         //mTimer.stop();
     }
     {
@@ -7670,20 +7830,20 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
     using SrcGridT = nanovdb::FloatGrid;
     const float tolerance = 0.001f;
     const nanovdb::Coord ijk(101,0,0);
-    auto srcHandle = nanovdb::createLevelSetSphere();
+    auto srcHandle = nanovdb::tools::createLevelSetSphere();
     SrcGridT *srcGrid = srcHandle.grid<float>();
     EXPECT_TRUE(srcGrid);
     //std::cerr << "Grid size: " << (srcGrid->gridSize() >> 20) << " MB\n";
     EXPECT_EQ(1.0f, srcGrid->tree().getValue(ijk));

-    nanovdb::CreateNanoGrid converter(*srcGrid);
+    nanovdb::tools::CreateNanoGrid converter(*srcGrid);
     {// create nanovdb::FloatGrid from nanovdb::FloatGrid
         using DstBuildT = float;
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_EQ(1.0f, dstGrid->tree().getValue(ijk));
     }
     {// create nanovdb::DoubleGrid from nanovdb::FloatGrid
@@ -7691,7 +7851,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_EQ(1.0, dstGrid->tree().getValue(ijk));
     }
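// ============================================================================
// [editor's sketch] One tools::CreateNanoGrid converter can emit several
// destination value types from the same source grid, which is what the blocks
// around this point test. Hedged sketch (the header path and the quantize()
// wrapper are assumptions):
#if 0 // illustration only
#include <nanovdb/tools/CreateNanoGrid.h>
void quantize(nanovdb::FloatGrid &srcGrid)
{
    nanovdb::tools::CreateNanoGrid<nanovdb::FloatGrid> converter(srcGrid);
    auto full = converter.getHandle<float>();        // lossless copy
    auto half = converter.getHandle<nanovdb::Fp16>();// 16-bit quantization
    auto tiny = converter.getHandle<nanovdb::Fp4>(); // 4-bit quantization
}
#endif
// ============================================================================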
     {// create nanovdb::Fp4Grid from nanovdb::FloatGrid
@@ -7699,7 +7859,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_NEAR(1.0f, dstGrid->tree().getValue(ijk), tolerance);
         //EXPECT_EQ(1.0f, dstGrid->tree().getValue(ijk));
     }
@@ -7708,7 +7868,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_NEAR(1.0f, dstGrid->tree().getValue(ijk), tolerance);
         //EXPECT_EQ(1.0f, dstGrid->tree().getValue(ijk));
     }
@@ -7717,7 +7877,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_NEAR(1.0f, dstGrid->tree().getValue(ijk), tolerance);
         //EXPECT_EQ(1.0f, dstGrid->tree().getValue(ijk));
     }
@@ -7726,7 +7886,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_NEAR(1.0f, dstGrid->tree().getValue(ijk), tolerance);
         //EXPECT_EQ(1.0f, dstGrid->tree().getValue(ijk));
     }
@@ -7735,7 +7895,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
         auto dstHandle = converter.getHandle<DstBuildT>();
         auto *dstGrid = dstHandle.grid<DstBuildT>();
         EXPECT_TRUE(dstGrid);
-        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
+        //std::cerr << "Grid<"<())<<"> size: " << (dstGrid->gridSize() >> 20) << " MB\n";
         EXPECT_EQ(true, dstGrid->tree().getValue(ijk));
     }
 }// CreateNanoGridFromFloat

@@ -7743,7 +7903,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromFloat)
 TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
 {
     using SrcBuildT = nanovdb::Vec3f;
-    using SrcGridT  = nanovdb::build::Grid<SrcBuildT>;
+    using SrcGridT  = nanovdb::tools::build::Grid<SrcBuildT>;
     //
     const SrcBuildT a(1.5f,0.0f,-9.1f), b(0.0f,0.0f,0.0f);
@@ -7753,15 +7913,15 @@ TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
     EXPECT_EQ(a, grid.tree().getValue(p));
     EXPECT_EQ(b, grid.tree().getValue(q));
     //
-    auto srcHandle = nanovdb::createNanoGrid(grid);
+    auto srcHandle = nanovdb::tools::createNanoGrid(grid);
     auto *srcGrid = srcHandle.grid<SrcBuildT>();
     EXPECT_TRUE(srcGrid);
     EXPECT_EQ(a, srcGrid->tree().getValue(p));
     EXPECT_EQ(b, srcGrid->tree().getValue(q));

-    {// create nanovdb::ValueIndexGrid from nanovdb::build::Grid
+    {// create nanovdb::ValueIndexGrid from nanovdb::tools::build::Grid
         using DstBuildT = nanovdb::ValueIndex;
-        auto handle = nanovdb::createNanoGrid<SrcGridT, DstBuildT>(grid, 0u, false, false);// no channels, stats or tiles
+        auto handle = nanovdb::tools::createNanoGrid<SrcGridT, DstBuildT>(grid, 0u, false, false);// no channels, stats or tiles
         auto *idxGrid = handle.grid<DstBuildT>();
         EXPECT_TRUE(idxGrid);
         EXPECT_EQ(1u, idxGrid->activeVoxelCount());
@@ -7769,9 +7929,9 @@ TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
         EXPECT_EQ(1, idxGrid->tree().getValue(q));
         EXPECT_EQ(8, idxGrid->tree().getValue(p));
     }
-    {// create nanovdb::ValueOnIndexGrid from nanovdb::build::Grid
+    {// create nanovdb::ValueOnIndexGrid from nanovdb::tools::build::Grid
         using DstBuildT = nanovdb::ValueOnIndex;
-        auto handle = nanovdb::createNanoGrid<SrcGridT, DstBuildT>(grid, 0u, false, false);// no channels, stats or tiles
+        auto handle = nanovdb::tools::createNanoGrid<SrcGridT, DstBuildT>(grid, 0u, false, false);// no channels, stats or tiles
         auto *idxGrid = handle.grid<DstBuildT>();
         EXPECT_TRUE(idxGrid);
         EXPECT_EQ(1u, idxGrid->activeVoxelCount());
@@ -7782,7 +7942,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
     {// create nanovdb::ValueIndexGrid from nanovdb::Grid
         using DstBuildT = nanovdb::ValueIndex;
         using SrcGridT = nanovdb::Vec3fGrid;
-        auto handle = nanovdb::createNanoGrid<SrcGridT, DstBuildT>(*srcGrid, 0u, false, false);// no channels, stats or tiles
+        auto handle = nanovdb::tools::createNanoGrid<SrcGridT, DstBuildT>(*srcGrid, 0u, false, false);// no channels, stats or tiles
         auto *idxGrid = handle.grid<DstBuildT>();
         EXPECT_TRUE(idxGrid);
         EXPECT_EQ(1u, idxGrid->activeVoxelCount());
@@ -7793,7 +7953,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
     {// create nanovdb::ValueOnIndexGrid from nanovdb::Grid
         using DstBuildT = nanovdb::ValueOnIndex;
         using SrcGridT = nanovdb::Vec3fGrid;
-        auto handle = nanovdb::createNanoGrid<SrcGridT, DstBuildT>(*srcGrid, 0u, false, false);// no channels, stats or tiles
+        auto handle = nanovdb::tools::createNanoGrid<SrcGridT, DstBuildT>(*srcGrid, 0u, false, false);// no channels, stats or tiles
         auto *idxGrid = handle.grid<DstBuildT>();
         EXPECT_TRUE(idxGrid);
         EXPECT_EQ(1u, idxGrid->activeVoxelCount());
@@ -7805,7 +7965,7 @@ TEST_F(TestNanoVDB, CreateNanoGridFromVec3f)
 TEST_F(TestNanoVDB, LongGridName)
 {
-    using SrcGridT = nanovdb::build::Grid<float>;
+    using SrcGridT = nanovdb::tools::build::Grid<float>;
     nanovdb::GridData tmp;
     tmp.init();
     EXPECT_EQ('\0', tmp.mGridName[0]);
@@ -7823,7 +7983,7 @@ TEST_F(TestNanoVDB, LongGridName)
         EXPECT_EQ(gridName, srcGrid.getName());
         srcGrid.tree().setValue(nanovdb::Coord(-256), 10.0f);
         const bool isLong = length > limit;
-        auto handle = nanovdb::createNanoGrid(srcGrid);
+        auto handle = nanovdb::tools::createNanoGrid(srcGrid);
         auto* dstGrid = handle.grid<float>();
         EXPECT_TRUE(dstGrid);
         EXPECT_EQ(1u, dstGrid->activeVoxelCount());
@@ -7870,10 +8030,10 @@ TEST_F(TestNanoVDB, mergeSplitGrids)
     size_t size1 = 0, size2 = 0;
     std::vector<nanovdb::GridHandle<>> handles1, handles2;
     std::vector<std::string> gridNames;
-    //nanovdb::CpuTimer timer("create 5 host grids");
+    //nanovdb::util::Timer timer("create 5 host grids");
     for (int radius = 100; radius<150; radius += 10) {
         gridNames.emplace_back("sphere_" + std::to_string(radius));
-        handles1.emplace_back(nanovdb::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
+        handles1.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
                                                             nanovdb::Vec3d(0), gridNames.back()));
         EXPECT_FALSE(handles1.back().isPadded());
         size1 += handles1.back().size();
@@ -7883,7 +8043,7 @@ TEST_F(TestNanoVDB, mergeSplitGrids)
     //timer.restart("create 5 host grids");
     for (int radius = 150; radius<200; radius += 10) {
         gridNames.emplace_back("sphere_" + std::to_string(radius));
-        handles2.emplace_back(nanovdb::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
+        handles2.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
                                                             nanovdb::Vec3d(0), gridNames.back()));
         size2 += handles2.back().size();
     }
@@ -7959,17 +8119,17 @@ TEST_F(TestNanoVDB, mergeSplitGrids)
     //timer.stop();
 }// mergeSplitGrids

-TEST_F(TestNanoVDB, writeReadRadGrid)
+TEST_F(TestNanoVDB, writeReadGridBuffer)
 {
     const nanovdb::Coord ijk(101,0,0);
-    auto handle1 = nanovdb::createLevelSetSphere();
+    auto handle1 = nanovdb::tools::createLevelSetSphere();
     auto *fltGrid = handle1.grid<float>();
     EXPECT_TRUE(fltGrid);
     //std::cerr << "Grid size: " << (fltGrid->gridSize() >> 20) << " MB\n";
     EXPECT_EQ(1.0f, fltGrid->tree().getValue(ijk));
     {// create an IndexGrid with an internal channel and write it to file
-        auto handle = nanovdb::createNanoGrid(*fltGrid,1u, true, true);// 1 channel, include stats and tile values
+        auto handle =
+        auto handle = nanovdb::tools::createNanoGrid(*fltGrid,1u, true, true);// 1 channel, include stats and tile values
         handle.write("data/raw_grid.nvdb");
     }
     {// read and test IndexGrid
@@ -7988,7 +8148,7 @@ TEST_F(TestNanoVDB, writeReadGridBuffer)
         EXPECT_EQ(1.0f, acc(ijk));

         // compute the gradient from channel ID 0
-        nanovdb::GradStencil<nanovdb::ChannelAccessor<float>> stencil(acc);
+        nanovdb::math::GradStencil<nanovdb::ChannelAccessor<float>> stencil(acc);
         stencil.moveTo(ijk);
         EXPECT_EQ(nanovdb::Vec3f(1.0f,0.0f,0.0f), stencil.gradient());

@@ -7999,11 +8159,11 @@ TEST_F(TestNanoVDB, writeReadGridBuffer)
         stencil.moveTo(ijk);// re-populates the stencil cache
         EXPECT_EQ(nanovdb::Vec3f(0.5f,0.0f,0.0f), stencil.gradient());
     }
-}// writeReadRadGrid
+}// writeReadGridBuffer

 TEST_F(TestNanoVDB, GridHandleIO)
 {
-    auto handle = nanovdb::createLevelSetSphere();
+    auto handle = nanovdb::tools::createLevelSetSphere();
     EXPECT_TRUE(handle.grid<float>());
     handle.write("data/sphere_raw.nvdb");
     ASSERT_THROW(handle.read("data/dummy_raw.nvdb"), std::ios_base::failure);
@@ -8016,15 +8176,15 @@ TEST_F(TestNanoVDB, GridHandleIO)
     EXPECT_TRUE(handle.grid<float>());
     ASSERT_THROW(handle.read("data/merge1.nvdb"), std::logic_error);
     ASSERT_THROW(handle.read("data/merge1.nvdb"), std::exception);
-}
+}// GridHandleIO

 TEST_F(TestNanoVDB, GridCountAndIndex)
 {
     {// create multiple grids and write them to file
         std::vector<nanovdb::GridHandle<>> handles;
-        handles.emplace_back(nanovdb::createLevelSetSphere());
-        handles.emplace_back(nanovdb::createLevelSetSphere());
-        handles.emplace_back(nanovdb::createLevelSetSphere());
+        handles.emplace_back(nanovdb::tools::createLevelSetSphere());
+        handles.emplace_back(nanovdb::tools::createLevelSetSphere());
+        handles.emplace_back(nanovdb::tools::createLevelSetSphere());
         EXPECT_EQ(3u, handles.size());
         for (auto &h : handles) EXPECT_EQ(1u, h.gridCount());
         nanovdb::io::writeGrids("data/3_spheres.nvdb", handles);
@@ -8036,8 +8196,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
         EXPECT_TRUE(grid);
         EXPECT_EQ(0u, grid->gridIndex());
         EXPECT_EQ(1u, grid->gridCount());
-        EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-        EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+        EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+        EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
     }
     {// readGrid one by one
         for (uint32_t i=0; i<3u; ++i) {
@@ -8047,8 +8207,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
             EXPECT_TRUE(grid);
             EXPECT_EQ(0u, grid->gridIndex());
             EXPECT_EQ(1u, grid->gridCount());
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
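// ============================================================================
// [editor's sketch] Checksum validation changed in three ways at once here: it
// moved into nanovdb::tools, it takes a grid pointer instead of a dereferenced
// grid, and the ChecksumMode enum was renamed CheckMode. Hedged sketch:
#if 0 // illustration only
bool verify(const nanovdb::NanoGrid<float> *grid)
{
    return nanovdb::tools::validateChecksum(grid) &&                         // cheap default check
           nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full); // full voxel-data checksum
}
#endif
// ============================================================================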
         }
     }
     {// read all grids
@@ -8060,8 +8220,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
             EXPECT_TRUE(grid);
             EXPECT_EQ(i, grid->gridIndex());
             EXPECT_EQ(3u, grid->gridCount());
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
         }
     }
     {// read all raw grids
@@ -8073,8 +8233,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
             EXPECT_TRUE(grid);
             EXPECT_EQ(i, grid->gridIndex());
             EXPECT_EQ(3u, grid->gridCount());
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
         }
     }
     {// read all raw grids
@@ -8086,8 +8246,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
             EXPECT_TRUE(grid);
             EXPECT_EQ(i, grid->gridIndex());
             EXPECT_EQ(3u, grid->gridCount());
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-            EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+            EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
         }
     }
     {// read single raw grid
@@ -8099,8 +8259,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
         EXPECT_TRUE(grid);
         EXPECT_EQ(0u, grid->gridIndex());
         EXPECT_EQ(1u, grid->gridCount());
-        EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-        EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+        EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+        EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
     }
     ASSERT_THROW(handle.read("data/3_spheres_raw.nvdb", 4), std::runtime_error);
     ASSERT_THROW(handle.read("data/3_spheres_raw.nvdb",-1), std::runtime_error);
@@ -8113,8 +8273,8 @@ TEST_F(TestNanoVDB, GridCountAndIndex)
        EXPECT_TRUE(grid);
        EXPECT_EQ(0u, grid->gridIndex());
        EXPECT_EQ(1u, grid->gridCount());
-       EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-       EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+       EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+       EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
     }
     ASSERT_THROW(nanovdb::io::readGrid("data/3_spheres_raw.nvdb", 4), std::runtime_error);
 }
@@ -8125,7 +8285,7 @@ TEST_F(TestNanoVDB, CustomStreamIO)
     std::ostringstream outputStream(std::ios_base::out | std::ios_base::binary);
     {
         std::vector<nanovdb::GridHandle<>> handles;
-        handles.emplace_back(nanovdb::createLevelSetSphere());
+        handles.emplace_back(nanovdb::tools::createLevelSetSphere());
         EXPECT_EQ(1u, handles.size());
         nanovdb::io::writeGrids(outputStream, handles, nanovdb::io::Codec::NONE);
     }
@@ -8143,8 +8303,8 @@ TEST_F(TestNanoVDB, CustomStreamIO)
     EXPECT_TRUE(grid);
     EXPECT_EQ(0u, grid->gridIndex());
     EXPECT_EQ(1u, grid->gridCount());
-    EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-    EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+    EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+    EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
     }
 }// CustomStreamIO

@@ -8152,7 +8312,7 @@ TEST_F(TestNanoVDB, CustomStreamGridHandleIO)
 {
     std::ostringstream outputStream(std::ios_base::out | std::ios_base::binary);
     {
-        nanovdb::createLevelSetSphere().write(outputStream);
+        nanovdb::tools::createLevelSetSphere().write(outputStream);
     }
     std::string payload = outputStream.str();
@@ -8168,11 +8328,158 @@ TEST_F(TestNanoVDB, CustomStreamGridHandleIO)
     EXPECT_TRUE(grid);
     EXPECT_EQ(0u, grid->gridIndex());
     EXPECT_EQ(1u, grid->gridCount());
-    EXPECT_TRUE(nanovdb::validateChecksum(*grid));
-    EXPECT_TRUE(nanovdb::validateChecksum(*grid, nanovdb::ChecksumMode::Full));
+    EXPECT_TRUE(nanovdb::tools::validateChecksum(grid));
+    EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full));
     }
 }// CustomStreamGridHandleIO

+// make -j testNanoVDB && ./unittest/testNanoVDB --gtest_filter="*strcpy"
+TEST_F(TestNanoVDB, strcpy)
+{
+    EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, "this is a test"));
+    //std::cerr << "mStr = \"" << mStr << "\"" << std::endl;
+    EXPECT_TRUE(nanovdb::util::streq(mStr, "this is a test"));
+    EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("this is a test"));
+
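// ============================================================================
// [editor's sketch] Besides C-strings, the new util::strcpy/strcat overloads
// accept an integer plus an optional base (10 by default), as the assertions
// below exercise. Hedged sketch:
#if 0 // illustration only
char buf[64];
nanovdb::util::strcpy(buf, 1234567);    // decimal: "1234567"
nanovdb::util::strcat(buf, -123456);    // appends "-123456"
nanovdb::util::strcpy(buf, 1234567, 2); // binary: "100101101011010000111"
#endif
// ============================================================================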
"this is a test 2")); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "this is a test 2")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("this is a test 2")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, "")); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, 0)); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "0")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("0")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, 1234567)); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "1234567")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("1234567")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, 1234567, 10)); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "1234567")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, -123456)); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "-123456")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("-123456")); + + EXPECT_EQ(mStr, nanovdb::util::strcpy(mStr, 1234567,2)); + //std::cerr << "mStr = \"" << mStr << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(mStr, "100101101011010000111")); + EXPECT_EQ(nanovdb::util::strlen(mStr), std::strlen("100101101011010000111")); +}// strcpy + +// make -j testNanoVDB && ./unittest/testNanoVDB --gtest_filter="*strcat" +TEST_F(TestNanoVDB, strcat) +{ + char str[100];// = {'\0'};// important to null terminate + str[0] = '\0';// important to null terminate + + EXPECT_EQ(str, nanovdb::util::strcat(str, "1 ")); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 ")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, "2 ")); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 ")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, "")); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 ")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, 0)); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 0")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, 1234567)); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 01234567")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, 1234567, 10)); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 012345671234567")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, -123456)); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 012345671234567-123456")); + + EXPECT_EQ(str, nanovdb::util::strcat(str, 1234567,2)); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "1 2 012345671234567-123456100101101011010000111")); +}// strcat + +// make -j testNanoVDB && ./unittest/testNanoVDB --gtest_filter="*checkGrid" +TEST_F(TestNanoVDB, checkGrid) +{ + char str[100]; + + auto handle = nanovdb::tools::createLevelSetSphere(); + auto *grid = handle.grid(); + EXPECT_TRUE(grid); + + nanovdb::tools::checkGrid( nanovdb::util::PtrAdd(grid, 1), str); + //std::cerr << "str = \"" << str << 
"\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid pointer: Grid is misaligned")); + + grid->mMagic = 0; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid magic number: unknown")); + grid->mMagic = NANOVDB_MAGIC_NUMB; + + grid->mVersion = 0; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Incompatible version number: 0.0.0")); + grid->mVersion = nanovdb::Version(); + + grid->mGridCount = 0; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Zero grid count")); + grid->mGridCount = 1; + + grid->mGridIndex = 1; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "grid index(1) >= grid count(1)")); + grid->mGridIndex = 0; + + grid->mGridClass = nanovdb::GridClass::End; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid GridClass(END)")); + grid->mGridClass = nanovdb::GridClass::Staggered; + + grid->mGridType = nanovdb::GridType::End; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid GridType(End)")); + + grid->mGridType = nanovdb::GridType::Vec3f; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid combination of BuildType(float) and GridType(Vec3f)")); + + grid->mGridType = nanovdb::GridType::Float; + nanovdb::tools::checkGrid( grid, str); + //std::cerr << "str = \"" << str << "\"" << std::endl; + EXPECT_TRUE(nanovdb::util::streq(str, "Invalid combination of GridType(float) and GridClass(MAC)")); + grid->mGridClass = nanovdb::GridClass::LevelSet; + + memset(str, 0, 100); + nanovdb::tools::checkGrid( grid, str, nanovdb::CheckMode::Full); + //nanovdb::tools::checkGrid( grid, str, nanovdb::ChecksumMode::Full);// deprecation warning + EXPECT_TRUE(nanovdb::util::empty(str)); +}// checkGrid + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/nanovdb/nanovdb/unittest/TestNanoVDB.cu b/nanovdb/nanovdb/unittest/TestNanoVDB.cu index fc88e95d99..0b0ee9eccf 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cu +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cu @@ -4,39 +4,41 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include // for std::sort +#include // for std::setw, std::setfill namespace nanovdb {// this namespace is required by gtest namespace test { -// used for testing CudaDeviceBuffer +// used for testing cuda::DeviceBuffer void device2host(size_t count) { const size_t size = count * sizeof(float); - auto buffer = nanovdb::CudaDeviceBuffer::create(size, nullptr, false);// on device only + auto buffer = nanovdb::cuda::DeviceBuffer::create(size, nullptr, false);// on device only EXPECT_EQ(size, buffer.size()); EXPECT_FALSE(buffer.data()); EXPECT_TRUE(buffer.deviceData()); float *d_array = reinterpret_cast(buffer.deviceData()); 
     constexpr unsigned int num_threads = 256;
     unsigned int num_blocks = num_blocks = (static_cast<unsigned int>(count) + num_threads - 1) / num_threads;
-    cudaLambdaKernel<<<num_blocks, num_threads>>>(count, [=] __device__ (size_t i) {d_array[i] = float(i);});
+    nanovdb::util::cuda::lambdaKernel<<<num_blocks, num_threads>>>(count, [=] __device__ (size_t i) {d_array[i] = float(i);});
     buffer.deviceDownload();// copy device -> host
     EXPECT_EQ(size, buffer.size());
     EXPECT_TRUE(buffer.data());
@@ -44,7 +46,7 @@ void device2host(size_t count)
     float *array = reinterpret_cast<float*>(buffer.data());
     for (size_t i=0; i<count; ++i) EXPECT_EQ(array[i], float(i));
     float *d_array = reinterpret_cast<float*>(buffer.deviceData());
     constexpr unsigned int num_threads = 256;
     unsigned int num_blocks = num_blocks = (static_cast<unsigned int>(count) + num_threads - 1) / num_threads;
-    cudaLambdaKernel<<<num_blocks, num_threads>>>(count, [=] __device__ (size_t i) {
+    nanovdb::util::cuda::lambdaKernel<<<num_blocks, num_threads>>>(count, [=] __device__ (size_t i) {
         if (d_array[i] != float(i)) *d_test = false;
         d_array[i] = float(i) + 1.0f;
     });
@@ -95,25 +97,25 @@ void cudaStr()
     int n, *d_n;
     cudaCheck(cudaMalloc((void**)&d_n, sizeof(int)));

-    cudaLambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
-        cudaStrcpy(d_str, "this is a test");
+    nanovdb::util::cuda::lambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
+        nanovdb::util::strcpy(d_str, "this is a test");
     });
     cudaCheck(cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost));
     EXPECT_STREQ(str, "this is a test");

-    cudaLambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
-        cudaStrcat(d_str, " #2");
+    nanovdb::util::cuda::lambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
+        nanovdb::util::strcat(d_str, " #2");
     });
     cudaCheck(cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost));
     EXPECT_STREQ(str, "this is a test #2");

-    cudaLambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
-        *d_n = cudaStrcmp(d_str, "this is a test");
+    nanovdb::util::cuda::lambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
+        *d_n = nanovdb::util::strcmp(d_str, "this is a test");
     });
     cudaCheck(cudaMemcpy(&n, d_n, sizeof(int), cudaMemcpyDeviceToHost));
     //std::cerr << "n = " << n << std::endl;
     EXPECT_EQ(signum(std::strcmp(str, "this is a test")), signum(n));

-    cudaLambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
-        *d_n = cudaStrcmp(d_str, "this is a test #2");
+    nanovdb::util::cuda::lambdaKernel<<<1, 1>>>(1, [=] __device__ (size_t) {
+        *d_n = nanovdb::util::strcmp(d_str, "this is a test #2");
     });
     cudaCheck(cudaMemcpy(&n, d_n, sizeof(int), cudaMemcpyDeviceToHost));
     EXPECT_EQ(std::strcmp(str, "this is a test #2"), n);
@@ -146,7 +148,7 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_float)
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, num_points);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, num_points);
     cudaCheck(cudaFree(d_coords));
     EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
     EXPECT_FALSE(handle.data());// no grid was yet allocated on the CPU
@@ -236,7 +238,7 @@ struct AccessLeafMask{
 TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueIndex)
 {
     using BuildT = nanovdb::ValueIndex;
-    using GridT = nanovdb::NanoGrid;
+    using GridT = nanovdb::NanoGrid;
     const size_t num_points = 3;
     nanovdb::Coord coords[num_points] = {nanovdb::Coord(1, 2, 3),
                                          nanovdb::Coord(1, 2, 4),
@@ -244,10 +246,10 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueIndex)
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU
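// ============================================================================
// [editor's sketch] cudaVoxelsToGrid() is now tools::cuda::voxelsToGrid(); fed
// device-resident ijk coordinates it builds the whole grid on the GPU, so the
// handle starts out with device data only. Hedged sketch (BuildT float assumed):
#if 0 // illustration only
auto handle = nanovdb::tools::cuda::voxelsToGrid<float>(d_coords, num_points);
EXPECT_TRUE(handle.deviceData()); // grid lives on the GPU...
EXPECT_FALSE(handle.data());      // ...and not yet on the CPU
handle.deviceDownload();          // create the host copy on demand
#endif
// ============================================================================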
 #if 0
-    nanovdb::CudaPointsToGrid converter;
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
 #else
-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, num_points);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, num_points);
 #endif
     cudaCheck(cudaFree(d_coords));
     EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
@@ -313,10 +315,10 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueOnIndex)
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

 #if 0
-    nanovdb::CudaPointsToGrid converter;
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
 #else
-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, num_points);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, num_points);
 #endif
     cudaCheck(cudaFree(d_coords));

@@ -411,10 +413,10 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueOnIndexMask)
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

 #if 0
-    nanovdb::CudaPointsToGrid converter;
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
 #else
-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, num_points);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, num_points);
 #endif
     cudaCheck(cudaFree(d_coords));

@@ -509,7 +511,7 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueOnIndexMask)
 TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old)
 {
     using BuildT = nanovdb::ValueOnIndex;
-    //nanovdb::CpuTimer timer;
+    //nanovdb::util::Timer timer;
     const size_t voxelCount = 1 << 20;// 1048576
     std::vector<nanovdb::Coord> voxels;
     {//generate random voxels
@@ -524,14 +526,14 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old)
     }
 #if 0
     {// Build grid on CPU
-        nanovdb::build::Grid buildGrid(0.0f);
+        nanovdb::tools::build::Grid buildGrid(0.0f);
         //timer.start("Building grid on CPU from "+std::to_string(voxels.size())+" points");
-        nanovdb::forEach(0, voxelCount, voxelCount >> 6, [&](const nanovdb::Range1D &r){
+        nanovdb::util::forEach(0, voxelCount, voxelCount >> 6, [&](const nanovdb::util::Range1D &r){
             auto acc = buildGrid.getWriteAccessor();
             for (size_t i=r.begin(); i!=r.end(); ++i) acc.setValueOn(voxels[i]);
         });
         //timer.restart("Converting CPU build::Grid to nanovdb");
-        auto handle = nanovdb::createNanoGrid(buildGrid);
+        auto handle = nanovdb::tools::createNanoGrid(buildGrid);
         //timer.stop();
     }
 #endif
@@ -544,7 +546,7 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old)
     //timer.stop();

     //timer.start("Building grid on GPU from "+std::to_string(voxels.size())+" points");
-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, voxelCount, 1.0);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, voxelCount, 1.0);
     //timer.stop();

     EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
@@ -568,7 +570,7 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old)
     EXPECT_EQ(nanovdb::Vec3d(1.0), grid->voxelSize());

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(voxels,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(voxels,[&](const nanovdb::util::Range1D &r){
         auto acc = grid->getAccessor();
         for (size_t i=r.begin(); i!=r.end(); ++i) {
             const nanovdb::Coord &ijk = voxels[i];
@@ -590,10 +592,10 @@ TEST(TestNanoVDBCUDA, mergeSplitGrids)
     size_t size1 = 0, size2 = 0;
     std::vector<nanovdb::GridHandle<>> handles1, handles2;
     std::vector<std::string> gridNames;
-    //nanovdb::CpuTimer timer("create 5 host grids");
+    //nanovdb::util::Timer timer("create 5 host grids");
     for (int radius = 100; radius<150; radius += 10) {
         gridNames.emplace_back("sphere_" + std::to_string(radius));
-        handles1.emplace_back(nanovdb::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
+        handles1.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
                                                             nanovdb::Vec3d(0), gridNames.back()));
         EXPECT_FALSE(handles1.back().isPadded());
         size1 += handles1.back().size();
@@ -603,7 +605,7 @@ TEST(TestNanoVDBCUDA, mergeSplitGrids)
     //timer.restart("create 5 host grids");
     for (int radius = 150; radius<200; radius += 10) {
         gridNames.emplace_back("sphere_" + std::to_string(radius));
-        handles2.emplace_back(nanovdb::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
+        handles2.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
                                                             nanovdb::Vec3d(0), gridNames.back()));
         size2 += handles2.back().size();
     }
@@ -665,15 +667,15 @@ TEST(TestNanoVDBCUDA, mergeSplitGrids)
 TEST(TestNanoVDBCUDA, mergeSplitDeviceGrids)
 {
-    using BufferT = nanovdb::CudaDeviceBuffer;
+    using BufferT = nanovdb::cuda::DeviceBuffer;
     using HandleT = nanovdb::GridHandle<BufferT>;
     size_t size = 0;
     std::vector<HandleT> handles;
     std::vector<std::string> gridNames;
-    //nanovdb::CpuTimer timer("create 10 host grids");
+    //nanovdb::util::Timer timer("create 10 host grids");
     for (int radius = 100; radius<200; radius += 10) {
         gridNames.emplace_back("sphere_" + std::to_string(radius));
-        handles.emplace_back(nanovdb::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
+        handles.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3,
                                                            nanovdb::Vec3d(0), gridNames.back()));
         EXPECT_FALSE(handles.back().isPadded());
         size += handles.back().size();
@@ -682,7 +684,7 @@ TEST(TestNanoVDBCUDA, mergeSplitDeviceGrids)
     for (auto &h : handles) h.deviceUpload();
     EXPECT_EQ(10u, handles.size());
     //timer.restart("merging device grids");
-    auto mergedHandle = nanovdb::mergeDeviceGrids(handles);
+    auto mergedHandle = nanovdb::cuda::mergeGridHandles(handles);
     EXPECT_EQ(size, mergedHandle.size());
     EXPECT_FALSE(mergedHandle.data());
     EXPECT_TRUE(mergedHandle.deviceData());
@@ -704,7 +706,7 @@ TEST(TestNanoVDBCUDA, mergeSplitDeviceGrids)
         EXPECT_EQ(strcmp(gridNames[i].c_str(), gridData->mGridName),0);
     }
     //timer.restart("splitting device grids");
-    auto splitHandles = nanovdb::splitDeviceGrids(mergedHandle);
+    auto splitHandles = nanovdb::cuda::splitGridHandles(mergedHandle);
     //timer.restart("unit-test split grids");
     EXPECT_EQ(10u, splitHandles.size());
     for (uint32_t i=0u; i<10u; ++i) {
@@ -724,14 +726,14 @@ TEST(TestNanoVDBCUDA, mergeSplitDeviceGrids)
 // make -j 4 testNanoVDB && ./unittest/testNanoVDB --gtest_filter="*Cuda*" --gtest_break_on_failure
 TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
 {
-    using BufferT = nanovdb::CudaDeviceBuffer;
+    using BufferT = nanovdb::cuda::DeviceBuffer;
     const float value = 1.23456f, backgroud = 1.0f;
     const nanovdb::Coord ijk(1,2,3);
     nanovdb::GridHandle<BufferT> floatHdl;
     nanovdb::FloatGrid *floatGrid = nullptr;
-    //nanovdb::CpuTimer timer;
+    //nanovdb::util::Timer timer;
     {// create float grid with one active voxel
-        nanovdb::build::Grid grid(backgroud);
+        nanovdb::tools::build::Grid grid(backgroud);
         auto srcAcc = grid.getAccessor();
         srcAcc.setValue(ijk, value);
         auto nodeCount = grid.nodeCount();
@@ -741,7 +743,7 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
         EXPECT_EQ(value, srcAcc.getValue(ijk));
         EXPECT_EQ(value, srcAcc.getValue(1,2,3));
         //timer.start("Create FloatGrid on CPU");
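// ============================================================================
// [editor's sketch] Handle concatenation moved too: mergeDeviceGrids and
// splitDeviceGrids are now nanovdb::cuda::mergeGridHandles and splitGridHandles,
// as the mergeSplitDeviceGrids hunks above show. Hedged sketch:
#if 0 // illustration only
using BufferT = nanovdb::cuda::DeviceBuffer;
std::vector<nanovdb::GridHandle<BufferT>> handles;     // grids already uploaded
auto merged = nanovdb::cuda::mergeGridHandles(handles);// one handle, many grids
auto split  = nanovdb::cuda::splitGridHandles(merged); // back to one grid each
#endif
// ============================================================================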
-        floatHdl = nanovdb::createNanoGrid<nanovdb::build::Grid<float>, float, BufferT>(grid);
+        floatHdl = nanovdb::tools::createNanoGrid<nanovdb::tools::build::Grid<float>, float, BufferT>(grid);
         EXPECT_TRUE(floatHdl);
         floatGrid = floatHdl.grid<float>();
         EXPECT_TRUE(floatGrid);
@@ -758,8 +760,8 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
         EXPECT_TRUE(acc.isActive(ijk));
     }
     //timer.restart("Create IndexGrid on CPU");
-    using BufferT = nanovdb::CudaDeviceBuffer;
-    auto idxHdl = nanovdb::createNanoGrid(*floatGrid, 0u, false, false, 1);
+    using BufferT = nanovdb::cuda::DeviceBuffer;
+    auto idxHdl = nanovdb::tools::createNanoGrid(*floatGrid, 0u, false, false, 1);
     //timer.restart("Copy IndexGrid from CPU to GPU");
     EXPECT_FALSE(idxHdl.deviceGrid());
     idxHdl.deviceUpload();
     auto *idxGrid = idxHdl.grid();
@@ -770,7 +772,7 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
     EXPECT_EQ(1u + 512u, idxGrid->valueCount());// background + 512 values in one leaf node
     float *values = new float[idxGrid->valueCount()], *d_values = nullptr;
     values[0] = backgroud;
-    const float *q = floatGrid->tree().getFirstLeaf()->data()->mValues;
+    const float *q = floatGrid->tree().getFirstLeaf()->mValues;
     for (float *p=values+1, *e=p+512;p!=e; ++p) *p = *q++;
     //timer.restart("Allocate and copy values from CPU to GPU");
     cudaCheck(cudaMalloc((void**)&d_values, idxGrid->valueCount()*sizeof(float)));
@@ -780,7 +782,7 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
     auto *d_idxGrid = idxHdl.deviceGrid();
     EXPECT_TRUE(d_idxGrid);
     //timer.restart("Call CudaIndexToGrid");
-    auto hdl = nanovdb::cudaIndexToGrid(d_idxGrid, d_values);
+    auto hdl = nanovdb::tools::cuda::indexToGrid(d_idxGrid, d_values);
     //timer.restart("unit-test");
     EXPECT_FALSE(hdl.grid());// no host grid
     EXPECT_TRUE(hdl.deviceGrid());
@@ -822,14 +824,14 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic)
 TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueIndex)
 {
     using BuildT = nanovdb::ValueIndex;
-    using BufferT = nanovdb::CudaDeviceBuffer;
-    //nanovdb::CpuTimer timer("Create FloatGrid on CPU");
-    auto floatHdl = nanovdb::createLevelSetSphere(100,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), "test");
+    using BufferT = nanovdb::cuda::DeviceBuffer;
+    //nanovdb::util::Timer timer("Create FloatGrid on CPU");
+    auto floatHdl = nanovdb::tools::createLevelSetSphere(100,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), "test");
     auto *floatGrid = floatHdl.grid<float>();
     EXPECT_TRUE(floatGrid);
     auto acc = floatGrid->getAccessor();
     //timer.restart("Create IndexGrid on CPU");
-    auto idxHdl = nanovdb::createNanoGrid(*floatGrid);
+    auto idxHdl = nanovdb::tools::createNanoGrid(*floatGrid);
     //timer.restart("Copy IndexGrid from CPU to GPU");
     idxHdl.deviceUpload();
     auto *idxGrid = idxHdl.grid();
@@ -850,7 +852,7 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueIndex)
     auto *d_idxGrid = idxHdl.deviceGrid();
     EXPECT_TRUE(d_idxGrid);
     //timer.restart("Call CudaIndexToGrid");
-    auto hdl = nanovdb::cudaIndexToGrid(d_idxGrid, d_values);
+    auto hdl = nanovdb::tools::cuda::indexToGrid(d_idxGrid, d_values);
     //timer.restart("unit-test");
     EXPECT_FALSE(hdl.grid());// no host grid
     EXPECT_TRUE(hdl.deviceGrid());
@@ -872,14 +874,14 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueIndex)
 TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueOnIndex)
 {
     using BuildT = nanovdb::ValueOnIndex;
-    using BufferT = nanovdb::CudaDeviceBuffer;
-    //nanovdb::CpuTimer timer("Create FloatGrid on CPU");
-    auto floatHdl = nanovdb::createLevelSetSphere(100,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), "test");
+    using BufferT = nanovdb::cuda::DeviceBuffer;
+    //nanovdb::util::Timer timer("Create FloatGrid on CPU");
+    auto floatHdl = nanovdb::tools::createLevelSetSphere(100,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), "test");
     auto *floatGrid = floatHdl.grid();
     EXPECT_TRUE(floatGrid);
     auto acc = floatGrid->getAccessor();
     //timer.restart("Create IndexGrid on CPU");
-    auto idxHdl = nanovdb::createNanoGrid(*floatGrid);
+    auto idxHdl = nanovdb::tools::createNanoGrid(*floatGrid);
     //timer.restart("Copy IndexGrid from CPU to GPU");
     idxHdl.deviceUpload();
     auto *idxGrid = idxHdl.grid();
@@ -902,7 +904,7 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueOnIndex)
     auto *d_idxGrid = idxHdl.deviceGrid();
     EXPECT_TRUE(d_idxGrid);
     //timer.restart("Call CudaIndexToGrid");
-    auto hdl = nanovdb::cudaIndexToGrid(d_idxGrid, d_values);
+    auto hdl = nanovdb::tools::cuda::indexToGrid(d_idxGrid, d_values);
     //timer.restart("unit-test");
     EXPECT_FALSE(hdl.grid());// no host grid
     EXPECT_TRUE(hdl.deviceGrid());
@@ -923,9 +925,9 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_ValueOnIndex)
 TEST(TestNanoVDBCUDA, CudaSignedFloodFill)
 {
-    using BufferT = nanovdb::CudaDeviceBuffer;
-    //nanovdb::CpuTimer timer("Create FloatGrid on CPU");
-    auto floatHdl = nanovdb::createLevelSetSphere(100);
+    using BufferT = nanovdb::cuda::DeviceBuffer;
+    //nanovdb::util::Timer timer("Create FloatGrid on CPU");
+    auto floatHdl = nanovdb::tools::createLevelSetSphere(100);
     auto *floatGrid = floatHdl.grid();
     EXPECT_TRUE(floatGrid);
     auto acc = floatGrid->getAccessor();
@@ -946,8 +948,8 @@ TEST(TestNanoVDBCUDA, CudaSignedFloodFill)
     auto *d_floatGrid = floatHdl.deviceGrid();
     EXPECT_TRUE(d_floatGrid);
     //timer.restart("Signed flood-fill on the GPU");
-    //nanovdb::cudaSignedFloodFill(d_floatGrid, true);
-    nanovdb::cudaSignedFloodFill(d_floatGrid);
+    //nanovdb::cuda::signedFloodFill(d_floatGrid, true);
+    nanovdb::tools::cuda::signedFloodFill(d_floatGrid);
     //timer.restart("Copy FloatGrid from GPU to CPU");
     floatHdl.deviceDownload();// GPU -> CPU
     //timer.stop();
@@ -970,8 +972,8 @@ TEST(TestNanoVDBCUDA, OneVoxelToGrid)
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

-    //nanovdb::GpuTimer timer("Create FloatGrid on GPU");
-    nanovdb::CudaPointsToGrid converter;
+    //nanovdb::util::cuda::Timer timer("Create FloatGrid on GPU");
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
     cudaCheck(cudaFree(d_coords));
     //timer.stop();
@@ -1034,8 +1036,8 @@ TEST(TestNanoVDBCUDA, ThreePointsToGrid)
     cudaCheck(cudaMalloc(&d_points, num_points * sizeof(Vec3T)));
     cudaCheck(cudaMemcpy(d_points, points, num_points * sizeof(Vec3T), cudaMemcpyHostToDevice));// CPU -> GPU

-    //nanovdb::GpuTimer timer("Create FloatGrid on GPU");
-    nanovdb::CudaPointsToGrid converter;
+    //nanovdb::util::cuda::Timer timer("Create FloatGrid on GPU");
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_points, num_points);
     cudaCheck(cudaFree(d_points));
     //timer.stop();
@@ -1150,8 +1152,8 @@ TEST(TestNanoVDBCUDA, EightVoxelsToFloatGrid)
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

-    //nanovdb::GpuTimer timer("Create FloatGrid on GPU");
-    nanovdb::CudaPointsToGrid converter;
+    //nanovdb::util::cuda::Timer timer("Create FloatGrid on GPU");
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
     //timer.stop();
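// ============================================================================
// [editor's sketch] nanovdb::CudaPointsToGrid is now
// nanovdb::tools::cuda::PointsToGrid. The converter object is the verbose form
// of voxelsToGrid() above and is used when extra control or statistics are
// needed. Hedged sketch (BuildT nanovdb::Point assumed):
#if 0 // illustration only
nanovdb::tools::cuda::PointsToGrid<nanovdb::Point> converter(voxelSize);
auto handle = converter.getHandle(d_points, pointCount);// grid built on the GPU
#endif
// ============================================================================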
     cudaCheck(cudaFree(d_coords));
@@ -1210,7 +1212,7 @@ TEST(TestNanoVDBCUDA, Random_CudaPointsToGrid_World64)
 {
     using BuildT = nanovdb::Point;//uint32_t;
     using Vec3T = nanovdb::Vec3d;
-    //nanovdb::CpuTimer timer;
+    //nanovdb::util::Timer timer;
     const size_t pointCount = 1 << 20;// 1048576
     std::vector<Vec3T> points;
     //generate random points
@@ -1233,7 +1235,7 @@ TEST(TestNanoVDBCUDA, Random_CudaPointsToGrid_World64)
     const double voxelSize = 8.0;
     //timer.start("Building grid on GPU from "+std::to_string(points.size())+" points");
-    nanovdb::CudaPointsToGrid converter(voxelSize);// unit map
+    nanovdb::tools::cuda::PointsToGrid converter(voxelSize);// unit map
     //converter.setVerbose();
     auto handle = converter.getHandle(d_points, pointCount);
     //timer.stop();
@@ -1294,7 +1296,7 @@ TEST(TestNanoVDBCUDA, Random_CudaPointsToGrid_World64)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(points,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(points,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const Vec3T *start = nullptr, *stop = nullptr;
@@ -1321,11 +1323,12 @@ TEST(TestNanoVDBCUDA, Random_CudaPointsToGrid_World64)
     //timer.stop();
 }// Random_CudaPointsToGrid_World64
+
 TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64)
 {
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3d;
-    //nanovdb::CpuTimer timer;
+    //nanovdb::util::Timer timer;
     const size_t pointCount = 1 << 20;// 1048576
     std::vector<Vec3T> points;
     //generate random points
@@ -1348,7 +1351,7 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64)
     const double voxelSize = 8.0;
     //timer.start("Building grid on GPU from "+std::to_string(points.size())+" points");
-    nanovdb::CudaPointsToGrid converter(voxelSize);// unit map
+    nanovdb::tools::cuda::PointsToGrid converter(voxelSize);// fixed voxel size
     //converter.setVerbose();
     auto handle = converter.getHandle(d_points, pointCount);
     //timer.stop();
@@ -1411,7 +1414,7 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(points,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(points,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const Vec3T *start = nullptr, *stop = nullptr;
@@ -1432,7 +1435,7 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64)
             bool test = false;
             for (uint64_t j=0; test == false && j<count; ++j) {
-                test = nanovdb::isApproxZero( (points[i] - xyz).lengthSqr() );
+                test = nanovdb::math::isApproxZero( (points[i] - xyz).lengthSqr() );
             }
             EXPECT_TRUE(test);
         }
@@ -1441,13 +1444,132 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64)
     //timer.stop();
 }// Large_CudaPointsToGrid_World64

+TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_World64_density)
+{// unlike the previous unit-test this one selects the dx to match a specific point density
+    using BuildT = nanovdb::Point;
+    using Vec3T = nanovdb::Vec3d;
+    //nanovdb::util::Timer timer;
+    const size_t pointCount = 1 << 20;// 1048576
+    std::vector<Vec3T> points;
+    //generate random points
+    points.reserve(pointCount);
+    std::srand(98765);
+    const int max = 512, min = -max;
+    auto op = [&](){return rand() % (max - min) + min;};
+    //timer.start("Creating "+std::to_string(pointCount)+" random points on the CPU");
+    while (points.size() < pointCount) points.emplace_back(op(), op(), op());
+    //timer.stop();
+    EXPECT_EQ(pointCount, points.size());
+    Vec3T* d_points;
+    const size_t pointSize = points.size() * sizeof(Vec3T);
+    //std::cerr << "Point footprint: " << (pointSize >> 20) << " MB" << std::endl;
+    //timer.start("Allocating "+std::to_string(pointSize >> 20)+" MB on the GPU");
+    cudaCheck(cudaMalloc(&d_points, pointSize));
+    //timer.restart("Copying points from CPU to GPU");
+    cudaCheck(cudaMemcpy(d_points, points.data(), pointSize, cudaMemcpyHostToDevice));
+    //timer.stop();
+
+    const int targetPointsPerVoxel = 60, tolerance = 1;
+    //timer.start("Building grid on GPU from "+std::to_string(points.size())+" points");
+    nanovdb::tools::cuda::PointsToGrid converter(targetPointsPerVoxel, tolerance);// fixed density
+    //converter.setVerbose(2);
+    auto handle = converter.getHandle(d_points, pointCount);
+    //timer.stop();
+    cudaCheck(cudaFree(d_points));
+    //std::cerr << "Grid size: " << (handle.size() >> 20) << " MB" << std::endl;
+
+    const uint32_t maxPointsPerVoxel = converter.maxPointsPerVoxel();
+    const uint32_t maxPointsPerLeaf  = converter.maxPointsPerLeaf();
+    EXPECT_NEAR(maxPointsPerVoxel, targetPointsPerVoxel, tolerance);
+    EXPECT_LE(maxPointsPerLeaf, targetPointsPerVoxel*512);
+    //std::cerr << "maxPointsPerLeaf = " << maxPointsPerLeaf << " maxPointsPerVoxel = " << maxPointsPerVoxel << std::endl;
+
+    EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
+    EXPECT_TRUE(handle.deviceGrid());
+    EXPECT_FALSE(handle.deviceGrid(0));
+    EXPECT_TRUE(handle.deviceGrid(0));
+    EXPECT_FALSE(handle.deviceGrid(1));
+    EXPECT_FALSE(handle.data());// no grid was yet allocated on the CPU
+
+    //timer.start("Allocating and copying grid from GPU to CPU");
+    auto *grid = handle.grid();// no grid on the CPU
+    EXPECT_FALSE(grid);
+    handle.deviceDownload();// creates a copy on the CPU
+    EXPECT_TRUE(handle.deviceData());
+    EXPECT_TRUE(handle.data());
+    auto *data = handle.gridData();
+    EXPECT_TRUE(data);
+    grid = handle.grid();
+    EXPECT_TRUE(grid);
+    //EXPECT_TRUE(grid->isLexicographic());
+    EXPECT_TRUE(grid->isBreadthFirst());
+    //EXPECT_EQ(nanovdb::Vec3d(voxelSize), grid->voxelSize());
+    EXPECT_EQ(pointCount, grid->pointCount());
+    EXPECT_TRUE(nanovdb::CoordBBox::createCube(min, max-1).isInside(grid->indexBBox()));
+    //std::cerr << grid->indexBBox() << std::endl;
+
+    EXPECT_STREQ("World64: Vec3 point coordinates in world space", grid->blindMetaData(0).mName);
+    {
+        auto mgrHdl = nanovdb::createNodeManager(*grid);
+        auto *mgr = mgrHdl.mgr();
+        EXPECT_TRUE(mgr);
+        for (uint32_t i=0; i<mgr->leafCount(); ++i) {
+            const auto &leaf = mgr->leaf(i);
+            for (int j=0; j<512; ++j) {
+                EXPECT_LE(leaf.getValue(j), maxPointsPerLeaf);
+                if (leaf.isActive(j)) {
+                    if (j>0) {
+                        EXPECT_LE(leaf.getValue(j) - leaf.getValue(j-1), maxPointsPerVoxel + tolerance);
+                    } else {
+                        EXPECT_LE(leaf.getValue(0), maxPointsPerVoxel);
+                    }
+                } else if (j>0) {
+                    EXPECT_EQ(leaf.getValue(j), leaf.getValue(j-1));
+                } else {
+                    EXPECT_EQ(leaf.getValue(0), 0u);
+                }
+            }// loop over voxels
+        }// loop over leaf nodes
+    }
+
+    //timer.restart("Parallel unit-testing on CPU");
+    nanovdb::util::forEach(points,[&](const nanovdb::util::Range1D &r){
+        nanovdb::PointAccessor acc(*grid);
+        EXPECT_TRUE(acc);
+        const Vec3T *start = nullptr, *stop = nullptr;
+        for (size_t i=r.begin(); i!=r.end(); ++i) {
+            const nanovdb::Coord ijk = grid->worldToIndex(points[i]).round();
+            EXPECT_TRUE(acc.probeLeaf(ijk)!=nullptr);
+            EXPECT_TRUE(acc.isActive(ijk));
+            EXPECT_LE(acc.getValue(ijk), pointCount);
+            const auto *leaf = acc.get<nanovdb::GetLeaf<BuildT>>(ijk);
+            EXPECT_TRUE(leaf);
+            const auto offset = leaf->CoordToOffset(ijk);
+            EXPECT_EQ(ijk, leaf->offsetToGlobalCoord(offset));
+            const uint64_t count = acc.voxelPoints(ijk, start, stop);
+            EXPECT_TRUE(start);
+            EXPECT_TRUE(stop);
+            EXPECT_LT(start, stop);
+            EXPECT_LE(count, maxPointsPerVoxel + tolerance);
+            bool test = false;
+            for (uint64_t j=0; test == false && j<count; ++j) {
+                test = nanovdb::math::isApproxZero( (points[i] - xyz).lengthSqr() );
+            }
+            EXPECT_TRUE(test);
+        }
+    });
+
+    //timer.stop();
+}// Large_CudaPointsToGrid_World64_density

 TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_World32)
 {
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3f;
-    //nanovdb::CpuTimer timer("Generate sphere with points");
-    auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
+    //nanovdb::util::Timer timer("Generate sphere with points");
+    auto pointsHandle = nanovdb::tools::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
     //timer.stop();

     auto *pointGrid = pointsHandle.grid();
@@ -1473,7 +1595,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_World32)
     //timer.stop();

     //timer.start("Building grid on GPU from "+std::to_string(pointCount)+" points");
-    nanovdb::CudaPointsToGrid converter(pointGrid->map());
+    nanovdb::tools::cuda::PointsToGrid converter(pointGrid->map());
     //converter.setVerbose();
     auto handle = converter.getHandle(d_points, pointCount);
     //timer.stop();
@@ -1535,7 +1657,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_World32)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(0u, pointCount, 1u,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(0u, pointCount, 1u,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const Vec3T *start = nullptr, *stop = nullptr;
@@ -1570,8 +1692,8 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel32)
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3f;

-    //nanovdb::CpuTimer timer("Generate sphere with points");
-    auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
+    //nanovdb::util::Timer timer("Generate sphere with points");
+    auto pointsHandle = nanovdb::tools::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
     //timer.stop();

     auto *pointGrid = pointsHandle.grid();
@@ -1598,7 +1720,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel32)
     //timer.start("Building grid on GPU from "+std::to_string(pointCount)+" points");
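// ============================================================================
// [editor's sketch] The Voxel32/Voxel16/Voxel8/PointID variants below differ
// only in how point positions are encoded in the grid's blind data; the
// encoding is selected with setPointType() before getHandle(). Hedged sketch:
#if 0 // illustration only
nanovdb::tools::cuda::PointsToGrid<nanovdb::Point> converter(pointGrid->map());
converter.setPointType(nanovdb::PointType::Voxel16);// quantized per-voxel offsets
auto handle = converter.getHandle(d_points, pointCount);
#endif
// ============================================================================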
     /////////////////////////////////////////////////////////////////////////
-    nanovdb::CudaPointsToGrid converter(pointGrid->map());
+    nanovdb::tools::cuda::PointsToGrid converter(pointGrid->map());
     //converter.setVerbose();
     converter.setPointType(nanovdb::PointType::Voxel32);
     auto handle = converter.getHandle(d_points, pointCount);
@@ -1662,7 +1784,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel32)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(0u, pointCount, 1u,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(0u, pointCount, 1u,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const Vec3T *start = nullptr, *stop = nullptr;
@@ -1704,8 +1826,8 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel16)
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3f;

-    //nanovdb::CpuTimer timer("Generate sphere with points");
-    auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
+    //nanovdb::util::Timer timer("Generate sphere with points");
+    auto pointsHandle = nanovdb::tools::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
     //timer.stop();

     auto *pointGrid = pointsHandle.grid();
@@ -1732,7 +1854,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel16)
     //timer.start("Building grid on GPU from "+std::to_string(pointCount)+" points");
     /////////////////////////////////////////////////////////////////////////
-    nanovdb::CudaPointsToGrid converter(pointGrid->map());
+    nanovdb::tools::cuda::PointsToGrid converter(pointGrid->map());
     //converter.setVerbose();
     converter.setPointType(nanovdb::PointType::Voxel16);
     auto handle = converter.getHandle(d_points, pointCount);
@@ -1796,7 +1918,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel16)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(0u, pointCount, 1u,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(0u, pointCount, 1u,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const nanovdb::Vec3u16 *start = nullptr, *stop = nullptr;
@@ -1831,8 +1953,8 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel8)
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3f;

-    //nanovdb::CpuTimer timer("Generate sphere with points");
-    auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
+    //nanovdb::util::Timer timer("Generate sphere with points");
+    auto pointsHandle = nanovdb::tools::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
     //timer.stop();

     auto *pointGrid = pointsHandle.grid();
@@ -1861,7 +1983,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel8)
     //timer.start("Building grid on GPU from "+std::to_string(pointCount)+" points");
     /////////////////////////////////////////////////////////////////////////
     //auto handle = nanovdb::cudaPointsToGrid(d_points, pointCount, nanovdb::PointType::Voxel8);
-    nanovdb::CudaPointsToGrid converter(pointGrid->map());
+    nanovdb::tools::cuda::PointsToGrid converter(pointGrid->map());
     //converter.setVerbose();
     converter.setPointType(nanovdb::PointType::Voxel8);
     auto handle = converter.getHandle(d_points, pointCount);
@@ -1925,7 +2047,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_Voxel8)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(0u, pointCount, 1u,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(0u, pointCount, 1u,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const nanovdb::Vec3u8 *start = nullptr, *stop = nullptr;
@@ -1960,8 +2082,8 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_PointID)
     using BuildT = nanovdb::Point;
     using Vec3T = nanovdb::Vec3f;

-    //nanovdb::CpuTimer timer("Generate sphere with points");
-    auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
+    //nanovdb::util::Timer timer("Generate sphere with points");
+    auto pointsHandle = nanovdb::tools::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), 0.5);
     //timer.stop();

     auto *pointGrid = pointsHandle.grid();
@@ -1990,7 +2112,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_PointID)
     //timer.start("Building grid on GPU from "+std::to_string(pointCount)+" points");
     /////////////////////////////////////////////////////////////////////////
     //auto handle = nanovdb::cudaPointsToGrid(d_points, pointCount, nanovdb::PointType::Voxel8);
-    nanovdb::CudaPointsToGrid converter(pointGrid->map());
+    nanovdb::tools::cuda::PointsToGrid converter(pointGrid->map());
     //converter.setVerbose(2);
     converter.setPointType(nanovdb::PointType::PointID);
     auto handle = converter.getHandle(d_points, pointCount);
@@ -2054,7 +2176,7 @@ TEST(TestNanoVDBCUDA, Sphere_CudaPointsToGrid_PointID)
     }

     //timer.restart("Parallel unit-testing on CPU");
-    nanovdb::forEach(0u, pointCount, 1u,[&](const nanovdb::Range1D &r){
+    nanovdb::util::forEach(0u, pointCount, 1u,[&](const nanovdb::util::Range1D &r){
         nanovdb::PointAccessor acc(*grid);
         EXPECT_TRUE(acc);
         const uint32_t *start = nullptr, *stop = nullptr;
@@ -2080,14 +2202,14 @@ TEST(TestNanoVDBCUDA, NanoGrid_Rgba8)
 {
-    using BuildT = nanovdb::Rgba8;
+    using BuildT = nanovdb::math::Rgba8;
     using GridT = nanovdb::NanoGrid;
     const size_t num_points = 1;
     nanovdb::Coord coords[num_points] = {nanovdb::Coord(1, 2, 3)}, *d_coords = nullptr;
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU

-    nanovdb::CudaPointsToGrid converter;
+    nanovdb::tools::cuda::PointsToGrid converter;
     auto handle = converter.getHandle(d_coords, num_points);
     cudaCheck(cudaFree(d_coords));

@@ -2121,7 +2243,7 @@ TEST(TestNanoVDBCUDA, cudaAddBlindData)
     nanovdb::Coord coords[num_points] = {nanovdb::Coord(1, 2, 3), nanovdb::Coord(10,20,8)}, *d_coords = nullptr;
     cudaCheck(cudaMalloc(&d_coords, num_points * sizeof(nanovdb::Coord)));
     cudaCheck(cudaMemcpy(d_coords, coords, num_points * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU
-    auto handle = nanovdb::cudaVoxelsToGrid(d_coords, num_points);
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, num_points);
     cudaCheck(cudaFree(d_coords));
     EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
     EXPECT_FALSE(handle.data());// no grid was yet allocated on the CPU
@@ -2138,13 +2260,13 @@ TEST(TestNanoVDBCUDA, cudaAddBlindData)
     cudaCheck(cudaMalloc(&d_blind, num_points * sizeof(float)));
     cudaCheck(cudaMemcpy(d_blind, blind, num_points * sizeof(float), cudaMemcpyHostToDevice));// CPU -> GPU

-    //nanovdb::GpuTimer timer("cudaAddBlindData");
-    auto handle2 = nanovdb::cudaAddBlindData(d_grid, d_blind, num_points);
+    //nanovdb::util::cuda::Timer timer("cudaAddBlindData");
+    auto handle2 = nanovdb::tools::cuda::addBlindData(d_grid, d_blind, num_points);
     cudaCheck(cudaFree(d_blind));
     //timer.stop();
     EXPECT_TRUE(handle2.deviceData());// grid only exists on the GPU
     EXPECT_FALSE(handle2.data());// no grid was yet allocated on the CPU
-    EXPECT_EQ(handle2.size(), handle.size() + sizeof(nanovdb::GridBlindMetaData) + nanovdb::AlignUp(num_points*sizeof(float)));
+    EXPECT_EQ(handle2.size(), handle.size() + sizeof(nanovdb::GridBlindMetaData) + nanovdb::math::AlignUp(num_points*sizeof(float)));

     auto *grid2 = handle2.grid();// no grid on the CPU
     EXPECT_FALSE(grid2);
@@ -2174,7 +2296,7 @@ TEST(TestNanoVDBCUDA, cudaAddBlindData)
     cudaCheck(cudaMalloc(&d_blind2, num_points * sizeof(nanovdb::Vec3f)));
     cudaCheck(cudaMemcpy(d_blind2, blind2, num_points * sizeof(nanovdb::Vec3f), cudaMemcpyHostToDevice));// CPU -> GPU

-    auto handle3 = nanovdb::cudaAddBlindData(d_grid2, d_blind2, num_points,
+    auto handle3 = nanovdb::tools::cuda::addBlindData(d_grid2, d_blind2, num_points,
                                              nanovdb::GridBlindDataClass::AttributeArray,
                                              nanovdb::GridBlindDataSemantic::PointPosition,
                                              "this is a test");
@@ -2207,7 +2329,7 @@ TEST(TestNanoVDBCUDA, cudaAddBlindData)
 TEST(TestNanoVDBCUDA, testGridHandleCopy)
 {
-    auto cudaHandle = nanovdb::createLevelSetSphere(100);
+    auto cudaHandle = nanovdb::tools::createLevelSetSphere(100);
     {
         auto *floatGrid = cudaHandle.grid();
         EXPECT_TRUE(floatGrid);
@@ -2231,13 +2353,18 @@ TEST(TestNanoVDBCUDA, testGridHandleCopy)
 // make -j testNanoVDB && ./unittest/testNanoVDB --gtest_break_on_failure --gtest_filter="*compareNodeOrdering"
 TEST(TestNanoVDBCUDA, compareNodeOrdering)
 {
-    using namespace nanovdb;
 #if 0
     const int voxelCount = 2;
coords[voxelCount]={Coord(-1,0,0), Coord(0,0,0)}; #else const int voxelCount = 5; - Coord coords[voxelCount]={Coord(0,0,0), Coord(256,0,0), Coord(0,0,8), Coord(0,-256,0), Coord(0,2,4)}; + nanovdb::Coord coords[voxelCount]={ + nanovdb::Coord(0,0,0), + nanovdb::Coord(256,0,0), + nanovdb::Coord(0,0,8), + nanovdb::Coord(0,-256,0), + nanovdb::Coord(0,2,4) + }; #endif {// check coordToKey and keyToCoord used in CudaPointsToGrid @@ -2268,13 +2395,13 @@ TEST(TestNanoVDBCUDA, compareNodeOrdering) } } - GridHandle handle1, handle2; + nanovdb::GridHandle handle1, handle2; { - build::FloatGrid grid(0.0f); + nanovdb::tools::build::FloatGrid grid(0.0f); auto acc = grid.getAccessor(); for (int i=0; i(); EXPECT_TRUE(grid1); @@ -2299,13 +2426,13 @@ TEST(TestNanoVDBCUDA, compareNodeOrdering) } { - Coord *d_coords = nullptr; - cudaCheck(cudaMalloc(&d_coords, voxelCount * sizeof(Coord))); - cudaCheck(cudaMemcpy(d_coords, coords, voxelCount * sizeof(Coord), cudaMemcpyHostToDevice));// CPU -> GPU + nanovdb::Coord *d_coords = nullptr; + cudaCheck(cudaMalloc(&d_coords, voxelCount * sizeof(nanovdb::Coord))); + cudaCheck(cudaMemcpy(d_coords, coords, voxelCount * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice));// CPU -> GPU #if 0 - auto cudaHandle = cudaVoxelsToGrid(d_coords, voxelCount); + auto cudaHandle = nanovdb::tools::cuda::voxelsToGrid(d_coords, voxelCount); #else - auto cudaHandle = cudaVoxelsToGrid(nanovdb::make_fancy(d_coords), voxelCount); + auto cudaHandle = nanovdb::tools::cuda::voxelsToGrid(nanovdb::make_fancy(d_coords), voxelCount); #endif cudaCheck(cudaFree(d_coords)); cudaHandle.deviceDownload(); @@ -2365,7 +2492,7 @@ template void test_ptr(const PtrT ptr) { using T = typename nanovdb::pointer_traits::element_type; - static const bool test = nanovdb::is_same::type>::value; + static const bool test = nanovdb::util::is_same::type>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits::element_size); EXPECT_EQ(3.14f, *ptr); @@ -2380,34 +2507,34 @@ TEST(TestNanoVDBCUDA, fancy_ptr) EXPECT_EQ(sizeof(uint8_t), nanovdb::pointer_traits>::element_size); {// test raw pointer - bool test = nanovdb::is_same::element_type, float>::value; + bool test = nanovdb::util::is_same::element_type, float>::value; EXPECT_TRUE(test); - test = nanovdb::is_same::element_type, const float>::value; + test = nanovdb::util::is_same::element_type, const float>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits::element_size); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits::element_size); } {// test std::shared_ptr - bool test = nanovdb::is_same>::element_type, float>::value; + bool test = nanovdb::util::is_same>::element_type, float>::value; EXPECT_TRUE(test); - test = nanovdb::is_same>::element_type, const float>::value; + test = nanovdb::util::is_same>::element_type, const float>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); } {// test std::unique_ptr - bool test = nanovdb::is_same>::element_type, float>::value; + bool test = nanovdb::util::is_same>::element_type, float>::value; EXPECT_TRUE(test); - test = nanovdb::is_same>::element_type, const float>::value; + test = nanovdb::util::is_same>::element_type, const float>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); } {// test fancy_ptr - bool test = nanovdb::is_same>::element_type, const float>::value; + bool 
test = nanovdb::util::is_same>::element_type, const float>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); - test = nanovdb::is_same>::element_type, const float>::value; + test = nanovdb::util::is_same>::element_type, const float>::value; EXPECT_TRUE(test); EXPECT_EQ(sizeof(float), nanovdb::pointer_traits>::element_size); } @@ -2426,13 +2553,13 @@ TEST(TestNanoVDBCUDA, CudaGridChecksum) const std::string s{"The quick brown fox jumps over the lazy dog"}; { // test CPU implementation of crc32 without a lookup table std::stringstream ss; - ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::crc32::checksum(s.c_str(), s.size()); + ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::util::crc32(s.c_str(), s.size()); EXPECT_EQ("414fa339", ss.str());// 414FA339 from https://rosettagit.org/drafts/crc-32/#c-1 } { // test CPU implementation of crc32 with a lookup table - auto lut = nanovdb::crc32::createLut(); + auto lut = nanovdb::util::createCrc32Lut(); std::stringstream ss; - ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::crc32::checksum(s.c_str(), s.size(), lut.get()); + ss << std::hex << std::setw(8) << std::setfill('0') << nanovdb::util::crc32(s.c_str(), s.size(), lut.get()); EXPECT_EQ("414fa339", ss.str());// 414FA339 from https://rosettagit.org/drafts/crc-32/#c-1 } {// test GPU implementation @@ -2441,7 +2568,7 @@ TEST(TestNanoVDBCUDA, CudaGridChecksum) cudaCheck(cudaMalloc((void**)&d_checksum, 4)); cudaCheck(cudaMalloc((void**)&d_str, s.size())); cudaCheck(cudaMemcpy(d_str, s.data(), s.size(), cudaMemcpyHostToDevice)); - nanovdb::crc32::checksumKernel<<<1, 1>>>((const uint8_t*)d_str, d_checksum, 1, s.size()); + nanovdb::util::cuda::crc32Kernel<<<1, 1>>>((const uint8_t*)d_str, d_checksum, 1, s.size()); cudaCheck(cudaMemcpy(&checksum, d_checksum, 4, cudaMemcpyDeviceToHost)); cudaCheck(cudaFree(d_str)); cudaCheck(cudaFree(d_checksum)); @@ -2449,7 +2576,7 @@ TEST(TestNanoVDBCUDA, CudaGridChecksum) ss << std::hex << std::setw(8) << std::setfill('0') << checksum; EXPECT_EQ("414fa339", ss.str());// 414FA339 from https://rosettagit.org/drafts/crc-32/#c-1 } - auto handle = nanovdb::createLevelSetSphere(100); + auto handle = nanovdb::tools::createLevelSetSphere(100); EXPECT_TRUE(handle.data()); auto *grid = handle.grid(); EXPECT_TRUE(grid); @@ -2458,38 +2585,40 @@ TEST(TestNanoVDBCUDA, CudaGridChecksum) #if 0// entire grid or just GridData+TreeData+RootData const size_t size = handle.size(); #else - const uint64_t size = grid->memUsage() + grid->tree().memUsage() + grid->tree().root().memUsage() - 16; + //const uint64_t size = grid->memUsage() + grid->tree().memUsage() + grid->tree().root().memUsage() - 16; + const uint64_t size = grid->memUsage() + grid->tree().memUsage() - 16; #endif //std::cerr << "Grid + tree + root data is " << size << " bytes\n"; - nanovdb::CpuTimer cpuTimer; - nanovdb::GpuTimer gpuTimer; + nanovdb::util::Timer cpuTimer; + nanovdb::util::cuda::Timer gpuTimer; + auto lut = nanovdb::util::createCrc32Lut(); + void *ptr = nanovdb::util::PtrAdd(handle.data(), 16); {//benchmark CPU version that uses a table //cpuTimer.start("CPU Tabled CRC of level set sphere"); - auto lut = nanovdb::crc32::createLut(); - checksum = nanovdb::crc32::checksum(handle.data()+16, size, lut.get()); + checksum = nanovdb::util::crc32(ptr, size, lut.get()); //cpuTimer.stop(); //std::cerr << checksum << std::endl; } {//benchmark CPU version that uses no table //cpuTimer.start("CPU Untabled CRC of level set sphere"); - auto 
checksum2 = nanovdb::crc32::checksum(handle.data()+16, size); + auto checksum2 = nanovdb::util::crc32(ptr, size); //cpuTimer.stop(); //std::cerr << checksum2 << std::endl; EXPECT_EQ(checksum, checksum2); } {//benchmark CPU version that uses table - //cpuTimer.start("CPU tabled crc32::CRC of level set sphere"); - auto lut = nanovdb::crc32::createLut(); - auto checksum2 = nanovdb::crc32::checksum(handle.data()+16, size, lut.get()); + //cpuTimer.start("CPU tabled util::CRC of level set sphere"); + auto checksum2 = nanovdb::util::crc32(ptr, size, lut.get()); //cpuTimer.stop(); //std::cerr << checksum2 << std::endl; EXPECT_EQ(checksum, checksum2); } uint32_t checksum2, *d_checksum; cudaCheck(cudaMalloc((void**)&d_checksum, 4)); + void *d_ptr = nanovdb::util::PtrAdd(handle.deviceData(), 16); {//benchmark GPU version that uses no table //gpuTimer.start("GPU Untabled CRC of level set sphere"); - nanovdb::crc32::checksumKernel<<<1, 1>>>(handle.deviceData()+16, d_checksum, 1, size); + nanovdb::util::cuda::crc32Kernel<<<1, 1>>>(d_ptr, d_checksum, 1, size); //gpuTimer.stop(); cudaCheck(cudaMemcpy(&checksum2, d_checksum, 4, cudaMemcpyDeviceToHost)); //std::cerr << checksum2 << std::endl; @@ -2497,38 +2626,38 @@ TEST(TestNanoVDBCUDA, CudaGridChecksum) } {//benchmark GPU version that uses no table //gpuTimer.start("GPU tabled CRC of level set sphere"); - uint32_t *d_lut = nanovdb::crc32::cudaCreateLut(); - nanovdb::crc32::checksumKernel<<<1, 1>>>(handle.deviceData()+16, d_checksum, 1, size, d_lut); + auto lut = nanovdb::util::cuda::createCrc32Lut(); + uint32_t *d_lut = lut.get(); + nanovdb::util::cuda::crc32Kernel<<<1, 1>>>(d_ptr, d_checksum, 1, size, d_lut); //gpuTimer.stop(); cudaCheck(cudaMemcpy(&checksum2, d_checksum, 4, cudaMemcpyDeviceToHost)); - cudaCheck(cudaFree(d_lut)); //std::cerr << checksum2 << std::endl; EXPECT_EQ(checksum, checksum2); } { //cpuTimer.start("CPU GridChecksum of level set sphere"); - nanovdb::GridChecksum cs; - cs(*grid); - checksum2 = cs.checksum(0);// only check the checksum of grid, tree and root data + nanovdb::Checksum cs = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Partial); + //cs(*grid); + //checksum2 = cs.checksum(0);// only check the checksum of grid, tree and root data //cpuTimer.stop(); //std::cerr << checksum2 << std::endl; - EXPECT_EQ(checksum, checksum2); + EXPECT_EQ(checksum, cs.head()); } - uint64_t fullChecksum; + nanovdb::Checksum fullChecksum; { //cpuTimer.start("CPU FULL cudaGridChecksum tabled CRC of level set sphere"); - nanovdb::updateChecksum(*handle.grid(), nanovdb::ChecksumMode::Full); + nanovdb::tools::updateChecksum(handle.grid(), nanovdb::CheckMode::Full); //cpuTimer.stop(); fullChecksum = handle.grid()->checksum(); - EXPECT_EQ(checksum, fullChecksum & 0xFFFFFFFF); + EXPECT_EQ(checksum, fullChecksum.head()); } { //gpuTimer.start("GPU FULL cudaGridChecksum tabled CRC of level set sphere"); - nanovdb::cudaGridChecksum(handle.deviceGrid(), nanovdb::ChecksumMode::Full); + nanovdb::tools::cuda::updateChecksum(handle.deviceGrid(), nanovdb::CheckMode::Full); //gpuTimer.stop(); - uint64_t fullChecksum2; + nanovdb::Checksum fullChecksum2; cudaCheck(cudaMemcpy(&fullChecksum2, (const uint8_t*)handle.deviceGrid() + 8, 8, cudaMemcpyDeviceToHost)); - EXPECT_EQ(checksum, fullChecksum2 & 0xFFFFFFFF); + EXPECT_EQ(checksum, fullChecksum2.head()); EXPECT_EQ(fullChecksum, fullChecksum2); } cudaCheck(cudaFree(d_checksum)); @@ -2539,7 +2668,7 @@ size_t countActiveVoxels(const nanovdb::NodeManager *d_mgr) { size_t count[2], *d_count; 
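// The kernel launch below exercises the renamed helper (formerly the free
// function cudaLambdaKernel, now nanovdb::util::cuda::lambdaKernel). The
// launch pattern is unchanged: the first argument is the number of items,
// and each thread receives its linear index, e.g. (sketch; block sizes are
// illustrative):
//   nanovdb::util::cuda::lambdaKernel<<<numBlocks, threadsPerBlock>>>(numItems, [=] __device__ (size_t i){ /* ... */ });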
cudaCheck(cudaMalloc((void**)&d_count, 2*sizeof(size_t))); - cudaLambdaKernel<<<1,1>>>(1, [=] __device__ (size_t){ + nanovdb::util::cuda::lambdaKernel<<<1,1>>>(1, [=] __device__ (size_t){ d_count[0] = 0; for (int i=0; ileafCount(); ++i) d_count[0] += d_mgr->leaf(i).valueMask().countOn(); for (int i=0; ilowerCount(); ++i) d_count[0] += d_mgr->lower(i).valueMask().countOn(); @@ -2555,7 +2684,7 @@ size_t countActiveVoxels(const nanovdb::NodeManager *d_mgr) TEST(TestNanoVDBCUDA, NodeManager) { - auto handle = nanovdb::createLevelSetSphere(100); + auto handle = nanovdb::tools::createLevelSetSphere(100); EXPECT_TRUE(handle.data()); auto *grid = handle.grid(); EXPECT_TRUE(grid); @@ -2563,7 +2692,7 @@ TEST(TestNanoVDBCUDA, NodeManager) auto *d_grid = handle.deviceGrid(); EXPECT_TRUE(d_grid); size_t count = 0; - nanovdb::CpuTimer cpuTimer; + nanovdb::util::Timer cpuTimer; { //cpuTimer.start("CPU NodeManager"); auto handle2 = nanovdb::createNodeManager<>(*grid); @@ -2573,10 +2702,10 @@ TEST(TestNanoVDBCUDA, NodeManager) count = mgr->grid().tree().activeVoxelCount(); } - nanovdb::GpuTimer gpuTimer; + nanovdb::util::cuda::Timer gpuTimer; { //gpuTimer.start("GPU NodeManager"); - auto handle2 = nanovdb::cudaCreateNodeManager(d_grid); + auto handle2 = nanovdb::cuda::createNodeManager(d_grid); //gpuTimer.stop(); auto *d_mgr = handle2.deviceMgr(); EXPECT_TRUE(d_mgr); @@ -2587,13 +2716,13 @@ TEST(TestNanoVDBCUDA, NodeManager) TEST(TestNanoVDBCUDA, GridStats) { using GridT = nanovdb::NanoGrid; - auto handle = nanovdb::createLevelSetSphere(100, + auto handle = nanovdb::tools::createLevelSetSphere(100, nanovdb::Vec3d(0), 1.0, 3.0, nanovdb::Vec3d(0), "test", - nanovdb::StatsMode::Disable); + nanovdb::tools::StatsMode::Disable); EXPECT_TRUE(handle.data()); GridT *grid = handle.grid(); EXPECT_TRUE(grid); @@ -2624,8 +2753,8 @@ TEST(TestNanoVDBCUDA, GridStats) EXPECT_EQ(n0, grid->tree().nodeCount(0)); } { - //nanovdb::CpuTimer cpuTimer("CPU gridStats: Default = Full"); - nanovdb::gridStats(*grid); + //nanovdb::util::Timer cpuTimer("CPU gridStats: Default = Full"); + nanovdb::tools::updateGridStats(grid); //cpuTimer.stop(); } {// check min/max using const iterators @@ -2674,8 +2803,8 @@ TEST(TestNanoVDBCUDA, GridStats) } { - //nanovdb::GpuTimer gpuTimer("GPU gridStats: Default = Full"); - nanovdb::cudaGridStats(d_grid); + //nanovdb::util::cuda::Timer gpuTimer("GPU gridStats: Default = Full"); + nanovdb::tools::cuda::updateGridStats(d_grid); //gpuTimer.stop(); } {// check bbox and stats of device grid @@ -2691,3 +2820,32 @@ TEST(TestNanoVDBCUDA, GridStats) EXPECT_EQ(grid->tree().root().stdDeviation(), data->mStdDevi); } }// GridStats + +TEST(TestNanoVDBCUDA, cudaIsValid) +{ + const auto mode = nanovdb::CheckMode::Full; + using GridT = nanovdb::NanoGrid; + auto handle = nanovdb::tools::createLevelSetSphere(100, + nanovdb::Vec3d(0), + 1.0, + 3.0, + nanovdb::Vec3d(0), + "test", + nanovdb::tools::StatsMode::Disable, + mode); + EXPECT_TRUE(handle.data()); + GridT *grid = handle.grid(); + EXPECT_TRUE(grid); + handle.deviceUpload(); + GridT *d_grid = handle.deviceGrid(); + EXPECT_TRUE(d_grid); + const bool verbose = false; + + EXPECT_TRUE(nanovdb::isValid(grid, mode, verbose)); + EXPECT_TRUE(nanovdb::tools::cuda::isValid(d_grid, mode, verbose)); + + grid->mGridType = nanovdb::GridType::Vec3f; + EXPECT_FALSE(nanovdb::isValid(grid, mode, verbose)); + handle.deviceUpload(); + EXPECT_FALSE(nanovdb::tools::cuda::isValid(d_grid, mode, verbose)); +}// cudaIsValid diff --git a/nanovdb/nanovdb/unittest/TestOpenVDB.cc 
b/nanovdb/nanovdb/unittest/TestOpenVDB.cc index e14792cb81..06b9da7f4c 100644 --- a/nanovdb/nanovdb/unittest/TestOpenVDB.cc +++ b/nanovdb/nanovdb/unittest/TestOpenVDB.cc @@ -7,22 +7,22 @@ #include // for FILE #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #if !defined(_MSC_VER) // does not compile in msvc c++ due to zero-sized arrays. #include -#include +#include #endif #include @@ -58,12 +58,14 @@ class TestOpenVDB : public ::testing::Test void SetUp() override { openvdb::initialize(); + mStr = new char[256]; // Code here will be called immediately after the constructor (right // before each test). } void TearDown() override { + delete [] mStr; // Code here will be called immediately after each test (right // before the destructor). } @@ -183,6 +185,7 @@ class TestOpenVDB : public ::testing::Test } openvdb::util::CpuTimer mTimer; + char *mStr; }; // TestOpenVDB // make -j && ./unittest/testOpenVDB --gtest_break_on_failure --gtest_filter="*getExtrema" @@ -190,7 +193,7 @@ TEST_F(TestOpenVDB, getExtrema) { using wBBoxT = openvdb::math::BBox; auto srcGrid = this->getSrcGrid(false, 0, 3);// level set of a bunny if available, else an octahedron - auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* dstGrid = handle.grid(); EXPECT_TRUE(dstGrid); @@ -205,15 +208,15 @@ TEST_F(TestOpenVDB, getExtrema) const wBBoxT iBBox = wBBox.applyInverseMap(*indexToWorldMap); //std::cerr << "Query bbox: iBBox = " << iBBox << ", wBBox = " << wBBox << std::endl; - const nanovdb::CoordBBox bbox(nanovdb::Round(iBBox.min()), - nanovdb::Round(iBBox.max())); + const nanovdb::CoordBBox bbox(nanovdb::math::Round(iBBox.min()), + nanovdb::math::Round(iBBox.max())); //std::cerr << "Query index bbox = " << bbox << std::endl; //nanovdb::NodeManager mgr(*dstGrid); //std::cerr << "Root child nodes: " << mgr.nodeCount(2) << std::endl; //mTimer.start("getExtrema"); - nanovdb::Extrema ext1 = nanovdb::getExtrema(*dstGrid, bbox), ext2; + nanovdb::tools::Extrema ext1 = nanovdb::tools::getExtrema(*dstGrid, bbox), ext2; //mTimer.restart("naive approach"); for (auto it = bbox.begin(); it; ++it) ext2.add(dstAcc.getValue(*it)); //mTimer.stop(); @@ -242,9 +245,9 @@ TEST_F(TestOpenVDB, MapToNano) EXPECT_EQ(ijk2, nanovdb::Coord(1, 2, -4)); } {// Vec3f - constexpr bool test1 = nanovdb::is_same::type>::value; + constexpr bool test1 = nanovdb::util::is_same::type>::value; EXPECT_TRUE(test1); - constexpr bool test2 = nanovdb::is_same::type>::value; + constexpr bool test2 = nanovdb::util::is_same::type>::value; EXPECT_FALSE(test2); const openvdb::Vec3f xyz1(1, 2, -4); nanovdb::Vec3f xyz2(-2, 7, 9); @@ -253,9 +256,9 @@ TEST_F(TestOpenVDB, MapToNano) EXPECT_EQ(xyz2, nanovdb::Vec3f(1, 2, -4)); } {// Vec4d - constexpr bool test1 = nanovdb::is_same::type>::value; + constexpr bool test1 = nanovdb::util::is_same::type>::value; EXPECT_TRUE(test1); - constexpr bool test2 = nanovdb::is_same::type>::value; + constexpr bool test2 = nanovdb::util::is_same::type>::value; EXPECT_FALSE(test2); const openvdb::Vec4d xyz1(1, 2, -4, 7); nanovdb::Vec4d xyz2(-2, 7, 9, -4); @@ -264,9 +267,9 @@ TEST_F(TestOpenVDB, MapToNano) EXPECT_EQ(xyz2, nanovdb::Vec4d(1, 2, -4, 7)); } {// MaskValue - constexpr bool test1 = 
nanovdb::is_same::type>::value; + constexpr bool test1 = nanovdb::util::is_same::type>::value; EXPECT_TRUE(test1); - constexpr bool test2 = nanovdb::is_same::type>::value; + constexpr bool test2 = nanovdb::util::is_same::type>::value; EXPECT_FALSE(test2); EXPECT_EQ(sizeof(nanovdb::ValueMask), sizeof(openvdb::ValueMask)); } @@ -295,8 +298,8 @@ TEST_F(TestOpenVDB, BasicGrid) const std::string name("test name"); - EXPECT_EQ(nanovdb::AlignUp(8 + 8 + 2 + 2 + 4 + 8 + nanovdb::GridData::MaxNameSize + 48 + sizeof(nanovdb::Map) + 24 + 4 + 4 + 8 + 4), sizeof(GridT)); - EXPECT_EQ(nanovdb::AlignUp(4*8 + 2 * 4 * 3 + 8), sizeof(TreeT)); + EXPECT_EQ(nanovdb::math::AlignUp(8 + 8 + 2 + 2 + 4 + 8 + nanovdb::GridData::MaxNameSize + 48 + sizeof(nanovdb::Map) + 24 + 4 + 4 + 8 + 4), sizeof(GridT)); + EXPECT_EQ(nanovdb::math::AlignUp(4*8 + 2 * 4 * 3 + 8), sizeof(TreeT)); EXPECT_EQ(size_t(4*8 + 2 * 4 * 3 + 8), sizeof(TreeT));// should already be 32 byte aligned size_t bytes[9]; @@ -524,7 +527,7 @@ TEST_F(TestOpenVDB, BasicGrid) EXPECT_EQ(uint32_t(NANOVDB_PATCH_VERSION_NUMBER), grid->version().getPatch()); EXPECT_TRUE(grid->isValid()); EXPECT_EQ(grid->gridType(), nanovdb::GridType::Float); - EXPECT_EQ(grid->gridClass(), nanovdb::GridClass::Unknown); + EXPECT_EQ(grid->gridClass(),nanovdb::GridClass::Unknown); EXPECT_FALSE(grid->isLevelSet()); EXPECT_FALSE(grid->isFogVolume()); EXPECT_FALSE(grid->isStaggered()); @@ -553,12 +556,35 @@ TEST_F(TestOpenVDB, BasicGrid) } } // BaseGrid + +TEST_F(TestOpenVDB, MagicType) +{ + {// toMagic(uint64_t) + EXPECT_EQ( nanovdb::toMagic(NANOVDB_MAGIC_NUMB), nanovdb::MagicType::NanoVDB ); + EXPECT_EQ( nanovdb::toMagic(NANOVDB_MAGIC_GRID), nanovdb::MagicType::NanoGrid ); + EXPECT_EQ( nanovdb::toMagic(NANOVDB_MAGIC_FILE), nanovdb::MagicType::NanoFile ); + EXPECT_EQ( nanovdb::toMagic(NANOVDB_MAGIC_NODE), nanovdb::MagicType::NanoNode ); + EXPECT_EQ( nanovdb::toMagic(NANOVDB_MAGIC_FRAG), nanovdb::MagicType::NanoFrag ); + EXPECT_EQ( nanovdb::toMagic( 0x56444220UL), nanovdb::MagicType::OpenVDB ); + } + + {// toStr(MagicType) + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::Unknown ), "unknown"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::OpenVDB ), "openvdb"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoVDB ), "nanovdb"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoGrid ), "nanovdb::Grid"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoFile ), "nanovdb::File"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoNode ), "nanovdb::NodeManager"), 0 ); + EXPECT_EQ( strcmp(nanovdb::toStr(mStr, nanovdb::MagicType::NanoFrag ), "fragmented nanovdb::Grid"), 0 ); + } +} + TEST_F(TestOpenVDB, OpenToNanoVDB_Empty) { { // empty grid openvdb::FloatGrid srcGrid(0.0f); auto srcAcc = srcGrid.getAccessor(); - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -593,7 +619,7 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Basic1) srcAcc.setValue(openvdb::Coord(1, 2, 3), 1.0f); EXPECT_TRUE(srcAcc.isValueOn(openvdb::Coord(1, 2, 3))); EXPECT_EQ(1.0f, srcAcc.getValue(openvdb::Coord(1, 2, 3))); - auto handle = nanovdb::createNanoGrid(srcGrid, nanovdb::StatsMode::All); + auto handle = nanovdb::tools::createNanoGrid(srcGrid, nanovdb::tools::StatsMode::All); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -632,13 +658,13 @@ 
TEST_F(TestOpenVDB, OpenToNanoVDB_Model) { auto srcGrid = this->getSrcGrid(false); //mTimer.start("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.start("Writing NanoVDB grid"); nanovdb::io::writeGrid("data/test.nvdb", handle, this->getCodec()); //mTimer.stop(); auto dstGrid = handle.grid(); - EXPECT_TRUE(nanovdb::isValid(dstGrid)); + EXPECT_TRUE(nanovdb::isAligned(dstGrid)); auto kernel = [&](const openvdb::CoordBBox& bbox) { using CoordT = const nanovdb::Coord; @@ -679,9 +705,9 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp4) EXPECT_EQ(2.0f, srcAcc.getValue(openvdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(openvdb::Coord( 50,-12, 30))); - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); //converter.setVerbose(); - converter.setStats(nanovdb::StatsMode::All); + converter.setStats(nanovdb::tools::StatsMode::All); auto handle = converter.getHandle();// (srcGrid); EXPECT_TRUE(handle); @@ -733,7 +759,7 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp4) {// Model auto openGrid = this->getSrcGrid(false); const float tolerance = 0.5f*openGrid->voxelSize()[0]; - nanovdb::CreateNanoGrid converter(*openGrid); + nanovdb::tools::CreateNanoGrid converter(*openGrid); converter.enableDithering(); //converter.setVerbose(2); auto handle = converter.getHandle(); @@ -774,8 +800,8 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp8) EXPECT_EQ(2.0f, srcAcc.getValue(openvdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(openvdb::Coord( 50,-12, 30))); - nanovdb::CreateNanoGrid converter(srcGrid); - converter.setStats(nanovdb::StatsMode::All); + nanovdb::tools::CreateNanoGrid converter(srcGrid); + converter.setStats(nanovdb::tools::StatsMode::All); auto handle = converter.getHandle(); EXPECT_TRUE(handle); @@ -816,7 +842,7 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp8) {// Model auto openGrid = this->getSrcGrid(false); const float tolerance = 0.05f*openGrid->voxelSize()[0]; - nanovdb::CreateNanoGrid converter(*openGrid); + nanovdb::tools::CreateNanoGrid converter(*openGrid); auto handle = converter.getHandle(); converter.enableDithering(); //converter.setVerbose(2); @@ -858,9 +884,9 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp16) EXPECT_EQ(2.0f, srcAcc.getValue(openvdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(openvdb::Coord( 50,-12, 30))); - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); //converter.setVerbose(2); - converter.setStats(nanovdb::StatsMode::All); + converter.setStats(nanovdb::tools::StatsMode::All); auto handle = converter.getHandle(); EXPECT_TRUE(handle); @@ -902,7 +928,7 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_Fp16) {// Model auto openGrid = this->getSrcGrid(false); const float tolerance = 0.005f*openGrid->voxelSize()[0]; - nanovdb::CreateNanoGrid converter(*openGrid); + nanovdb::tools::CreateNanoGrid converter(*openGrid); converter.enableDithering(); auto handle = converter.getHandle(); //converter.setVerbose(2); @@ -944,8 +970,8 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_FpN) EXPECT_EQ(2.0f, srcAcc.getValue(openvdb::Coord(-10, 20,-50))); EXPECT_EQ(3.0f, srcAcc.getValue(openvdb::Coord( 50,-12, 30))); - nanovdb::CreateNanoGrid converter(srcGrid); - converter.setStats(nanovdb::StatsMode::All); + nanovdb::tools::CreateNanoGrid converter(srcGrid); + converter.setStats(nanovdb::tools::StatsMode::All); auto handle = converter.getHandle(); EXPECT_TRUE(handle); @@ -990,11 +1016,11 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_FpN) 
#else auto openGrid = this->getSrcGrid(true, 1, 1);// FOG volume of Disney cloud or cube #endif - nanovdb::CreateNanoGrid converter(*openGrid); + nanovdb::tools::CreateNanoGrid converter(*openGrid); //converter.setVerbose(2); const float tolerance = 0.05f; - nanovdb::AbsDiff oracle(tolerance); + nanovdb::tools::AbsDiff oracle(tolerance); auto handle = converter.getHandle(oracle); auto* nanoGrid = handle.grid(); @@ -1014,13 +1040,13 @@ TEST_F(TestOpenVDB, OpenToNanoVDB_FpN) EXPECT_TRUE( oracle(exact, approx) ); } }; - nanovdb::forEach(openGrid->evalActiveVoxelBoundingBox(), kernel); + nanovdb::util::forEach(openGrid->evalActiveVoxelBoundingBox(), kernel); handle = nanovdb::io::readGrid("data/test_fpN.nvdb"); nanoGrid = handle.grid(); EXPECT_TRUE(nanoGrid); - nanovdb::forEach(openGrid->evalActiveVoxelBoundingBox(), kernel); + nanovdb::util::forEach(openGrid->evalActiveVoxelBoundingBox(), kernel); } } // OpenToNanoVDB_FpN @@ -1098,7 +1124,7 @@ TEST_F(TestOpenVDB, PointIndexGrid) EXPECT_EQ(pointCount, count); //mTimer.start("Generating NanoVDB grid from PointIndexGrid"); - auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::All, nanovdb::ChecksumMode::Full); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::All, nanovdb::CheckMode::Full); //mTimer.stop(); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); @@ -1210,7 +1236,7 @@ TEST_F(TestOpenVDB, PointDataGridBasic) srcGrid->setName("PointDataGrid"); //mTimer.start("Generating NanoVDB grid from PointDataGrid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.stop(); EXPECT_TRUE(handle); @@ -1242,7 +1268,7 @@ TEST_F(TestOpenVDB, PointDataGridBasic) // Create a read-only AttributeHandle. Position always uses Vec3f. openvdb::points::AttributeHandle positionHandle(leafIter->constAttributeArray("P")); openvdb::Coord ijkSrc(openvdb::Coord::min()); - nanovdb::Coord ijkDst(nanovdb::Maximum::value()); + nanovdb::Coord ijkDst(nanovdb::math::Maximum::value()); for (auto indexIter = leafIter->beginIndexOn(); indexIter; ++indexIter) { // Extract the local voxel-space position of the point relative to its occupying voxel ijk. const openvdb::Vec3f vxlSrc = positionHandle.get(*indexIter); @@ -1330,7 +1356,7 @@ TEST_F(TestOpenVDB, PointDataGridRandom) srcGrid->setName("PointDataGrid"); //mTimer.start("Generating NanoVDB grid from PointDataGrid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.stop(); EXPECT_TRUE(handle); @@ -1354,7 +1380,7 @@ TEST_F(TestOpenVDB, PointDataGridRandom) // Create a read-only AttributeHandle. Position always uses Vec3f. openvdb::points::AttributeHandle positionHandle(leafIter->constAttributeArray("P")); openvdb::Coord ijkSrc(openvdb::Coord::min()); - nanovdb::Coord ijkDst(nanovdb::Maximum::value()); + nanovdb::Coord ijkDst(nanovdb::math::Maximum::value()); for (auto indexIter = leafIter->beginIndexOn(); indexIter; ++indexIter) { // Extract the local voxel-space position of the point relative to its occupying voxel ijk. 
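// (Background: the "P" attribute of an openvdb::points grid stores each
// point's offset relative to the center of its occupying voxel, typically
// in [-0.5, 0.5)^3, so the world-space position is recovered as
// indexToWorld(ijk + offset).)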
const openvdb::Vec3f vxlSrc = positionHandle.get(*indexIter); @@ -1429,7 +1455,7 @@ TEST_F(TestOpenVDB, CNanoVDB) { auto srcGrid = this->getSrcGrid(); //mTimer.start("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.stop(); EXPECT_TRUE(handle); EXPECT_TRUE(handle.data()); @@ -1460,7 +1486,7 @@ TEST_F(TestOpenVDB, CNanoVDBTrilinear) { auto srcGrid = this->getSrcGrid(); //mTimer.start("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.stop(); EXPECT_TRUE(handle); EXPECT_TRUE(handle.data()); @@ -1502,7 +1528,7 @@ TEST_F(TestOpenVDB, CNanoVDBTrilinearStencil) { auto srcGrid = this->getSrcGrid(); //mTimer.start("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.stop(); EXPECT_TRUE(handle); EXPECT_TRUE(handle.data()); @@ -1543,13 +1569,13 @@ TEST_F(TestOpenVDB, CNanoVDBTrilinearStencil) TEST_F(TestOpenVDB, NanoToOpenVDB_BuildGrid) {// test build::Grid -> NanoVDB -> OpenVDB - nanovdb::build::Grid buildGrid(0.0f, "test", nanovdb::GridClass::LevelSet); + nanovdb::tools::build::Grid buildGrid(0.0f, "test", nanovdb::GridClass::LevelSet); auto buildAcc = buildGrid.getAccessor(); buildAcc.setValue(nanovdb::Coord(1, 2, 3), 1.0f); buildAcc.setValue(nanovdb::Coord(2, -2, 9), 2.0f); EXPECT_EQ(1.0f, buildAcc.getValue(nanovdb::Coord(1, 2, 3))); EXPECT_EQ(2.0f, buildAcc.getValue(nanovdb::Coord(2, -2, 9))); - auto handle = nanovdb::createNanoGrid(buildGrid); + auto handle = nanovdb::tools::createNanoGrid(buildGrid); EXPECT_TRUE(handle); auto* meta = handle.gridMetaData(); EXPECT_TRUE(meta); @@ -1565,7 +1591,7 @@ TEST_F(TestOpenVDB, NanoToOpenVDB_BuildGrid) EXPECT_EQ(1.0f, nanoAcc.getValue(nanovdb::Coord(1, 2, 3))); EXPECT_EQ(2.0f, nanoAcc.getValue(nanovdb::Coord(2, -2, 9))); - auto openGrid = nanovdb::nanoToOpenVDB(*nanoGrid); + auto openGrid = nanovdb::tools::nanoToOpenVDB(*nanoGrid); EXPECT_TRUE(openGrid); auto openAcc = openGrid->getAccessor(); EXPECT_EQ(1.0f, openAcc.getValue(openvdb::Coord(1, 2, 3))); @@ -1594,7 +1620,7 @@ TEST_F(TestOpenVDB, NanoToOpenVDB) //std::cerr << "Grid name: " << srcGrid->gridName() << std::endl; //mTimer.start("Deserializing NanoVDB grid"); - auto dstGrid = nanovdb::nanoToOpenVDB(*srcGrid); + auto dstGrid = nanovdb::tools::nanoToOpenVDB(*srcGrid); //mTimer.stop(); EXPECT_TRUE(dstGrid); @@ -1678,13 +1704,13 @@ TEST_F(TestOpenVDB, MultiFile) grid.setName("Int32 grid"); grid.tree().setValue(openvdb::Coord(-256), 10); EXPECT_EQ(1u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // 2: add an empty int32_t grid openvdb::Int32Grid grid(-4); grid.setName("Int32 grid, empty"); EXPECT_EQ(0u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // 3: add a ValueMask grid openvdb::MaskGrid grid(false); @@ -1698,7 +1724,7 @@ TEST_F(TestOpenVDB, MultiFile) grid.tree().evalActiveVoxelBoundingBox(bbox); //std::cerr << bbox << std::endl; EXPECT_EQ(openvdb::CoordBBox(min, max), bbox); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // 4: add a bool grid openvdb::BoolGrid grid(false); @@ -1707,7 +1733,7 @@ TEST_F(TestOpenVDB, MultiFile) EXPECT_EQ(1u, 
grid.activeVoxelCount()); grid.tree().setValue(openvdb::Coord( 10, 450, 90), true); EXPECT_EQ(2u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // 5: add a Vec3f grid openvdb::Vec3fGrid grid(openvdb::Vec3f(0.0f, 0.0f, -1.0f)); @@ -1716,7 +1742,7 @@ TEST_F(TestOpenVDB, MultiFile) EXPECT_EQ(0u, grid.activeVoxelCount()); grid.tree().setValue(openvdb::Coord(-256), openvdb::Vec3f(1.0f, 0.0f, 0.0f)); EXPECT_EQ(1u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } { // 6: add a Vec4f grid using OpenVDBVec4fGrid = openvdb::Grid::Type>; @@ -1727,7 +1753,7 @@ TEST_F(TestOpenVDB, MultiFile) EXPECT_EQ(0u, grid.activeVoxelCount()); grid.tree().setValue(openvdb::Coord(-256), openvdb::Vec4f(1.0f, 0.0f, 0.0f, 0.0f)); EXPECT_EQ(1u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); OpenVDBVec4fGrid::unregisterGrid(); } { // 7: add an int64_t grid @@ -1735,7 +1761,7 @@ TEST_F(TestOpenVDB, MultiFile) grid.setName("Int64 grid"); grid.tree().setValue(openvdb::Coord(0), 10); EXPECT_EQ(1u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } for (int i = 0; i < 10; ++i) {// 8 -> 17 const float radius = 100.0f; @@ -1743,7 +1769,7 @@ TEST_F(TestOpenVDB, MultiFile) const openvdb::Vec3f center(i * 10.0f, 0.0f, 0.0f); auto srcGrid = openvdb::tools::createLevelSetSphere(radius, center, voxelSize, width); srcGrid->setName("Level set sphere at (" + std::to_string(i * 10) + ",0,0)"); - handles.push_back(nanovdb::createNanoGrid(*srcGrid)); + handles.push_back(nanovdb::tools::createNanoGrid(*srcGrid)); } { // 18: add a double grid openvdb::DoubleGrid grid(0.0); @@ -1751,7 +1777,7 @@ TEST_F(TestOpenVDB, MultiFile) grid.setGridClass(openvdb::GRID_FOG_VOLUME); grid.tree().setValue(openvdb::Coord(6000), 1.0); EXPECT_EQ(1u, grid.activeVoxelCount()); - handles.push_back(nanovdb::createNanoGrid(grid)); + handles.push_back(nanovdb::tools::createNanoGrid(grid)); } nanovdb::io::writeGrids("data/multi.nvdb", handles, this->getCodec()); @@ -1798,15 +1824,15 @@ TEST_F(TestOpenVDB, MultiFile) EXPECT_EQ(1u, tree.nodeCount(2)); auto mgrHandle = nanovdb::createNodeManager(*grid); auto *mgr = mgrHandle.mgr(); - EXPECT_TRUE(nanovdb::isValid(mgr)); + EXPECT_TRUE(nanovdb::isAligned(mgr)); const auto& leaf = mgr->leaf(0); - EXPECT_TRUE(nanovdb::isValid(&leaf)); + EXPECT_TRUE(nanovdb::isAligned(&leaf)); EXPECT_EQ(bbox, leaf.bbox()); const auto& node1 = mgr->lower(0); - EXPECT_TRUE(nanovdb::isValid(&node1)); + EXPECT_TRUE(nanovdb::isAligned(&node1)); EXPECT_EQ(bbox, node1.bbox()); const auto& node2 = mgr->upper(0); - EXPECT_TRUE(nanovdb::isValid(&node2)); + EXPECT_TRUE(nanovdb::isAligned(&node2)); EXPECT_EQ(bbox, node2.bbox()); EXPECT_FALSE(grid->isLevelSet()); EXPECT_FALSE(grid->isFogVolume()); @@ -2050,9 +2076,9 @@ TEST_F(TestOpenVDB, LongGridName) EXPECT_EQ(1u, srcGrid.activeVoxelCount()); const bool isLong = length > limit; #if 1 - auto handle = nanovdb::createNanoGrid(srcGrid); + auto handle = nanovdb::tools::createNanoGrid(srcGrid); #else - nanovdb::CreateNanoGrid converter(srcGrid); + nanovdb::tools::CreateNanoGrid converter(srcGrid); auto handle = converter.getHandle(); #endif auto* dstGrid = handle.grid(); @@ -2092,8 +2118,8 @@ TEST_F(TestOpenVDB, LevelSetFiles) 
foundModels.push_back(fileName.substr(pos, fileName.size() - pos - 4 )); //mTimer.restart("Generating NanoVDB grid"); - //auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::All, nanovdb::ChecksumMode::Partial); - auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::BBox, nanovdb::ChecksumMode::Disable); + //auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::All, nanovdb::CheckMode::Partial); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::BBox, nanovdb::CheckMode::Disable); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid(os, handle, this->getCodec()); @@ -2170,7 +2196,7 @@ TEST_F(TestOpenVDB, FogFiles) foundModels.push_back(fileName.substr(pos, fileName.size() - pos - 4 )); //mTimer.restart("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::All, nanovdb::ChecksumMode::Partial); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::All, nanovdb::CheckMode::Partial); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid(os, handle, this->getCodec()); @@ -2245,7 +2271,7 @@ TEST_F(TestOpenVDB, PointFiles) EXPECT_TRUE(positionIndex != openvdb::points::AttributeSet::INVALID_POS); //mTimer.restart("Generating NanoVDB grid from PointDataGrid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid(os, handle, this->getCodec()); @@ -2268,7 +2294,7 @@ TEST_F(TestOpenVDB, PointFiles) // Create a read-only AttributeHandle. Position always uses Vec3f. openvdb::points::AttributeHandle positionHandle(leafIter->constAttributeArray("P")); openvdb::Coord ijkSrc(openvdb::Coord::min()); - nanovdb::Coord ijkDst(nanovdb::Maximum::value()); + nanovdb::Coord ijkDst(nanovdb::math::Maximum::value()); for (auto indexIter = leafIter->beginIndexOn(); indexIter; ++indexIter) { // Extract the index-space position of the point relative to its occupying voxel ijk. const openvdb::Vec3f vxlSrc = positionHandle.get(*indexIter); @@ -2326,7 +2352,7 @@ TEST_F(TestOpenVDB, Trilinear) acc.setValue(ijk, trilinear(srcGrid->indexToWorld(ijk))); } //mTimer.restart("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid("data/tmp.nvdb", handle); //mTimer.stop(); @@ -2349,11 +2375,11 @@ TEST_F(TestOpenVDB, Trilinear) //std::cerr << "Trilinear: exact = " << exact << ", approx = " << approx << std::endl; auto dstAcc = dstGrid->getAccessor(); - auto sampler0 = nanovdb::createSampler<0>(dstAcc); + auto sampler0 = nanovdb::math::createSampler<0>(dstAcc); //std::cerr << "0'th order: v = " << sampler0(ijk) << std::endl; EXPECT_EQ(approx, sampler0(ijk)); - auto sampler1 = nanovdb::createSampler<1>(dstAcc); // faster since it's using an accessor!!! + auto sampler1 = nanovdb::math::createSampler<1>(dstAcc); // faster since it's using an accessor!!! 
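// Note on the rename above: the sampler factory now lives in nanovdb::math;
// the template argument is still the interpolation order (0 = nearest
// neighbor, 1 = trilinear, 2 = triquadratic, 3 = tricubic, as the three
// tests below exercise). A minimal sketch, sampling at an index-space
// position:
//   auto acc = dstGrid->getAccessor();
//   auto s = nanovdb::math::createSampler<1>(acc);
//   float v = s(nanovdb::Vec3f(1.4f, 2.9f, 3.1f));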
//std::cerr << "1'th order: v = " << sampler1(ijk) << std::endl; EXPECT_EQ(exact, sampler1(ijk)); @@ -2367,8 +2393,8 @@ TEST_F(TestOpenVDB, Trilinear) EXPECT_NEAR(6.7f, gradWorld[1], 1e-5); EXPECT_NEAR(-3.5f, gradWorld[2], 1e-5); - nanovdb::SampleFromVoxels, 3> sampler3(dstGrid->tree()); - //auto sampler3 = nanovdb::createSampler<3>( dstAcc ); + nanovdb::math::SampleFromVoxels, 3> sampler3(dstGrid->tree()); + //auto sampler3 = nanovdb::math::createSampler<3>( dstAcc ); //std::cerr << "3'rd order: v = " << sampler3(ijk) << std::endl; EXPECT_EQ(exact, sampler3(ijk)); } // Trilinear @@ -2392,7 +2418,7 @@ TEST_F(TestOpenVDB, Triquadratic) acc.setValue(ijk, triquadratic(srcGrid->indexToWorld(ijk))); } //mTimer.restart("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid("data/tmp.nvdb", handle); //mTimer.stop(); @@ -2414,21 +2440,21 @@ TEST_F(TestOpenVDB, Triquadratic) //std::cerr << "Trilinear: exact = " << exact << ", approx = " << approx << std::endl; auto dstAcc = dstGrid->getAccessor(); - auto sampler0 = nanovdb::createSampler<0>(dstAcc); + auto sampler0 = nanovdb::math::createSampler<0>(dstAcc); //std::cerr << "0'th order: v = " << sampler0(ijk) << std::endl; EXPECT_NEAR(approx, sampler0(ijk), 1e-6); - auto sampler1 = nanovdb::createSampler<1>(dstAcc); + auto sampler1 = nanovdb::math::createSampler<1>(dstAcc); //std::cerr << "1'rd order: nanovdb = " << sampler1(ijk) << ", openvdb: " << openvdb::tools::Sampler<1>::sample(srcGrid->tree(), ijk) << std::endl; EXPECT_NE(exact, sampler1(ijk)); // it's non-linear EXPECT_NEAR(sampler1(ijk), openvdb::tools::Sampler<1>::sample(srcGrid->tree(), ijk), 1e-6); - auto sampler2 = nanovdb::createSampler<2>(dstAcc); + auto sampler2 = nanovdb::math::createSampler<2>(dstAcc); //std::cerr << "2'rd order: nanovdb = " << sampler2(ijk) << ", openvdb: " << openvdb::tools::Sampler<2>::sample(srcGrid->tree(), ijk) << std::endl; EXPECT_NEAR(sampler2(ijk), openvdb::tools::Sampler<2>::sample(srcGrid->tree(), ijk), 1e-6); EXPECT_NEAR(exact, sampler2(ijk), 1e-5); // it's a 2nd order polynomial - auto sampler3 = nanovdb::createSampler<3>(dstAcc); + auto sampler3 = nanovdb::math::createSampler<3>(dstAcc); //std::cerr << "3'rd order: v = " << sampler3(ijk) << std::endl; EXPECT_NEAR(exact, sampler3(ijk), 1e-4); // it's a 2nd order polynomial } // Triquadratic @@ -2451,7 +2477,7 @@ TEST_F(TestOpenVDB, Tricubic) acc.setValue(ijk, tricubic(srcGrid->indexToWorld(ijk))); } //mTimer.restart("Generating NanoVDB grid"); - auto handle = nanovdb::createNanoGrid(*srcGrid); + auto handle = nanovdb::tools::createNanoGrid(*srcGrid); //mTimer.restart("Writing NanoVDB grid"); nanovdb::io::writeGrid("data/tmp.nvdb", handle); //mTimer.stop(); @@ -2473,21 +2499,21 @@ TEST_F(TestOpenVDB, Tricubic) //std::cerr << "Trilinear: exact = " << exact << ", approx = " << approx << std::endl; auto dstAcc = dstGrid->getAccessor(); - auto sampler0 = nanovdb::createSampler<0>(dstAcc); + auto sampler0 = nanovdb::math::createSampler<0>(dstAcc); //std::cerr << "0'th order: v = " << sampler0(ijk) << std::endl; EXPECT_NEAR(approx, sampler0(ijk), 1e-6); - auto sampler1 = nanovdb::createSampler<1>(dstAcc); + auto sampler1 = nanovdb::math::createSampler<1>(dstAcc); //std::cerr << "1'rd order: nanovdb = " << sampler1(ijk) << ", openvdb: " << openvdb::tools::Sampler<1>::sample(srcGrid->tree(), ijk) << std::endl; EXPECT_NE(exact, sampler1(ijk)); // it's non-linear 
EXPECT_NEAR(sampler1(ijk), openvdb::tools::Sampler<1>::sample(srcGrid->tree(), ijk), 1e-6);
- auto sampler2 = nanovdb::createSampler<2>(dstAcc);
+ auto sampler2 = nanovdb::math::createSampler<2>(dstAcc);
//std::cerr << "2'rd order: nanovdb = " << sampler2(ijk) << ", openvdb: " << openvdb::tools::Sampler<2>::sample(srcGrid->tree(), ijk) << std::endl;
EXPECT_NEAR(sampler2(ijk), openvdb::tools::Sampler<2>::sample(srcGrid->tree(), ijk), 1e-6);
EXPECT_NE(exact, sampler2(ijk)); // it's a 3rd order polynomial
- auto sampler3 = nanovdb::createSampler<3>(dstAcc);
+ auto sampler3 = nanovdb::math::createSampler<3>(dstAcc);
//std::cerr << "3'rd order: v = " << sampler3(ijk) << std::endl;
EXPECT_NEAR(exact, sampler3(ijk), 1e-4); // it's a 3rd order polynomial
} // Tricubic
@@ -2495,7 +2521,7 @@ TEST_F(TestOpenVDB, GridValidator)
{
auto srcGrid = this->getSrcGrid();
- auto handle = nanovdb::createNanoGrid(*srcGrid, nanovdb::StatsMode::All, nanovdb::ChecksumMode::Full);
+ auto handle = nanovdb::tools::createNanoGrid(*srcGrid, nanovdb::tools::StatsMode::All, nanovdb::CheckMode::Full);
//mTimer.stop();
EXPECT_TRUE(handle);
EXPECT_TRUE(handle.data());
@@ -2503,34 +2529,34 @@ TEST_F(TestOpenVDB, GridValidator)
EXPECT_TRUE(grid);
//mTimer.start("isValid - detailed");
- EXPECT_TRUE(nanovdb::isValid(*grid, true, true));
+ EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, true));
//mTimer.stop();
//mTimer.start("isValid - not detailed");
- EXPECT_TRUE(nanovdb::isValid(*grid, false, true));
+ EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Partial, true));
//mTimer.stop();
//mTimer.start("Fast CRC");
- auto fastChecksum = nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full);
+ auto fastChecksum = nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full);
//mTimer.stop();
- EXPECT_EQ(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full));
+ EXPECT_EQ(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full));
auto* leaf = grid->tree().getFirstLeaf();
- EXPECT_TRUE(nanovdb::isValid(leaf));
+ EXPECT_TRUE(nanovdb::isAligned(leaf));
leaf->data()->mValues[512 >> 1] += 0.00001f; // slightly modify a single voxel value
- EXPECT_NE(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full));
- EXPECT_FALSE(nanovdb::isValid(*grid, true, false));
+ EXPECT_NE(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full));
+ EXPECT_FALSE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, false));
leaf->data()->mValues[512 >> 1] -= 0.00001f; // change back the single voxel value to its original value
- EXPECT_EQ(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full));
- EXPECT_TRUE(nanovdb::isValid(*grid, true, true));
+ EXPECT_EQ(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full));
+ EXPECT_TRUE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, true));
leaf->data()->mValueMask.toggle(512 >> 1); // change a single bit in a value mask
- EXPECT_NE(fastChecksum, nanovdb::checksum(*grid, nanovdb::ChecksumMode::Full));
- EXPECT_FALSE(nanovdb::isValid(*grid, true, false));
+ EXPECT_NE(fastChecksum, nanovdb::tools::evalChecksum(grid, nanovdb::CheckMode::Full));
+ EXPECT_FALSE(nanovdb::tools::isValid(grid, nanovdb::CheckMode::Full, false));
} // GridValidator
TEST_F(TestOpenVDB, BenchmarkHostBuffer)
@@ -2564,8 +2590,8 @@ TEST_F(TestOpenVDB, DenseIndexGrid)
// read openvdb::FloatGrid
auto srcGrid = this->getSrcGrid(false, 0, 0);// level set of a dragon if available, else
an octahedron auto& srcTree = srcGrid->tree(); - nanovdb::CreateNanoGrid builder(*srcGrid); - builder.setStats(nanovdb::StatsMode::All); + nanovdb::tools::CreateNanoGrid builder(*srcGrid); + builder.setStats(nanovdb::tools::StatsMode::All); // openvdb::FloatGrid -> nanovdb::FloatGrid auto handle = builder.getHandle(); EXPECT_TRUE(handle); @@ -2601,7 +2627,7 @@ TEST_F(TestOpenVDB, DenseIndexGrid) } //mTimer.stop(); auto *idxLeaf0 = idxGrid->tree().getFirstNode<0>(); - nanovdb::forEach(nanovdb::Range1D(0,idxGrid->tree().nodeCount(0)),[&](const nanovdb::Range1D &r){ + nanovdb::util::forEach(nanovdb::util::Range1D(0,idxGrid->tree().nodeCount(0)),[&](const nanovdb::util::Range1D &r){ auto fltAcc = fltGrid->getAccessor();// NOT thread-safe! for (auto i=r.begin(); i!=r.end(); ++i){ auto *idxLeaf = idxLeaf0 + i; @@ -2623,7 +2649,7 @@ TEST_F(TestOpenVDB, SparseIndexGrid) auto srcGrid = this->getSrcGrid(false, 0, 0);// level set of a dragon if available, else an octahedron // openvdb::FloatGrid -> nanovdb::IndexGrid - nanovdb::CreateNanoGrid builder(*srcGrid); + nanovdb::tools::CreateNanoGrid builder(*srcGrid); //mTimer.start("Create IndexGrid"); auto handle2 = builder.getHandle(1u, false, false); //mTimer.stop(); @@ -2651,25 +2677,25 @@ TEST_F(TestOpenVDB, SparseIndexGrid) TEST_F(TestOpenVDB, BuildNodeManager) { {// test NodeManager with build::Grid - using GridT = nanovdb::build::Grid; + using GridT = nanovdb::tools::build::Grid; GridT grid(0.0f); - nanovdb::build::NodeManager mgr(grid); + nanovdb::tools::build::NodeManager mgr(grid); using TreeT = GridT::TreeType; - static const bool test = nanovdb::is_same::type, TreeT::LeafNodeType>::value; + static const bool test = nanovdb::util::is_same::type, TreeT::LeafNodeType>::value; EXPECT_TRUE(test); } {// test NodeManager with openvdb::Grid using GridT = openvdb::FloatGrid; GridT grid(0.0f); - nanovdb::build::NodeManager mgr(grid); + nanovdb::tools::build::NodeManager mgr(grid); using TreeT = GridT::TreeType; - static const bool test = nanovdb::is_same::type, TreeT::LeafNodeType>::value; + static const bool test = nanovdb::util::is_same::type, TreeT::LeafNodeType>::value; EXPECT_TRUE(test); } {// test NodeTrait on nanovdb::Grid using GridT = nanovdb::NanoGrid; using TreeT = GridT::TreeType; - static const bool test = nanovdb::is_same::type, TreeT::LeafNodeType>::value; + static const bool test = nanovdb::util::is_same::type, TreeT::LeafNodeType>::value; EXPECT_TRUE(test); } }// BuildNodeManager @@ -2693,7 +2719,7 @@ TEST_F(TestOpenVDB, Benchmark_OpenVDB_PointIndexGrid) { const double voxelSize = 0.5; - nanovdb::CpuTimer timer("Generate sphere with points"); + nanovdb::util::Timer timer("Generate sphere with points"); auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), voxelSize); timer.stop(); @@ -2726,7 +2752,7 @@ TEST_F(TestOpenVDB, Benchmark_OpenVDB_PointDataGrid) { const double voxelSize = 0.5; - nanovdb::CpuTimer timer("Generate sphere with points"); + nanovdb::util::Timer timer("Generate sphere with points"); auto pointsHandle = nanovdb::createPointSphere(8, 100.0, nanovdb::Vec3d(0.0), voxelSize); timer.stop(); diff --git a/nanovdb/nanovdb/util/CpuTimer.h b/nanovdb/nanovdb/util/CpuTimer.h index 44bf155287..af1ac90d77 100644 --- a/nanovdb/nanovdb/util/CpuTimer.h +++ b/nanovdb/nanovdb/util/CpuTimer.h @@ -1,83 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/// @file CpuTimer.h -/// -/// @author Ken Museth -/// -/// @brief A simple timing class (in case openvdb::util::CpuTimer 
is unavailable) - -#ifndef NANOVDB_CPU_TIMER_H_HAS_BEEN_INCLUDED -#define NANOVDB_CPU_TIMER_H_HAS_BEEN_INCLUDED - -#include -#include - -namespace nanovdb { - -class CpuTimer -{ - std::chrono::high_resolution_clock::time_point mStart; -public: - /// @brief Default constructor - CpuTimer() {} - - /// @brief Constructor that starts the timer - /// @param msg string message to be printed when timer is started - /// @param os output stream for the message above - CpuTimer(const std::string &msg, std::ostream& os = std::cerr) {this->start(msg, os);} - - /// @brief Start the timer - /// @param msg string message to be printed when timer is started - /// @param os output stream for the message above - void start(const std::string &msg, std::ostream& os = std::cerr) - { - os << msg << " ... " << std::flush; - mStart = std::chrono::high_resolution_clock::now(); - } - - /// @brief elapsed time (since start) in miliseconds - template - auto elapsed() - { - auto end = std::chrono::high_resolution_clock::now(); - return std::chrono::duration_cast(end - mStart).count(); - } - - /// @brief stop the timer - /// @tparam AccuracyT Template parameter defining the accuracy of the reported times - /// @param os output stream for the message above - template - void stop(std::ostream& os = std::cerr) - { - auto end = std::chrono::high_resolution_clock::now(); - auto diff = std::chrono::duration_cast(end - mStart).count(); - os << "completed in " << diff; - if (std::is_same::value) {// resolved at compile-time - os << " microseconds" << std::endl; - } else if (std::is_same::value) { - os << " milliseconds" << std::endl; - } else if (std::is_same::value) { - os << " seconds" << std::endl; - } else { - os << " unknown time unit" << std::endl; - } - } - - /// @brief stop and start the timer - /// @tparam AccuracyT Template parameter defining the accuracy of the reported times - /// @param msg string message to be printed when timer is started - /// @param os output stream for the message above - template - void restart(const std::string &msg, std::ostream& os = std::cerr) - { - this->stop(); - this->start(msg, os); - } - - -};// CpuTimer - -} // namespace nanovdb - -#endif // NANOVDB_CPU_TIMER_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/Timer.h instead.") diff --git a/nanovdb/nanovdb/util/CreateNanoGrid.h b/nanovdb/nanovdb/util/CreateNanoGrid.h index 7ad71c57d4..eeef8ab71b 100644 --- a/nanovdb/nanovdb/util/CreateNanoGrid.h +++ b/nanovdb/nanovdb/util/CreateNanoGrid.h @@ -1,2075 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file CreateNanoGrid.h - - \author Ken Museth - - \date June 26, 2020 - - \note In the examples below we assume that @c srcGrid is a exiting grid of type - SrcGridT = @c openvdb::FloatGrid, @c openvdb::FloatGrid or @c nanovdb::build::FloatGrid. - - \brief Convert any grid to a nanovdb grid of the same type, e.g. float->float - \code - auto handle = nanovdb::createNanoGrid(srcGrid); - auto *dstGrid = handle.grid(); - \endcode - - \brief Convert a grid to a nanovdb grid of a different type, e.g. 
float->half - \code - auto handle = nanovdb::createNanoGrid(srcGrid); - auto *dstGrid = handle.grid(); - \endcode - - \brief Convert a grid to a nanovdb grid of the same type but using a CUDA buffer - \code - auto handle = nanovdb::createNanoGrid(srcGrid); - auto *dstGrid = handle.grid(); - \endcode - - \brief Create a nanovdb grid that indices values in an existing source grid of any type. - If DstBuildT = nanovdb::ValueIndex both active and in-active values are indexed - and if DstBuildT = nanovdb::ValueOnIndex only active values are indexed. - \code - using DstBuildT = nanovdb::ValueIndex;// index both active an inactive values - auto handle = nanovdb::createNanoGridSrcGridT,DstBuildT>(srcGrid,0,false,false);//no blind data, tile values or stats - auto *dstGrid = handle.grid(); - \endcode - - \brief Create a NanoVDB grid from scratch - \code -#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) - using SrcGridT = openvdb::FloatGrid; -#else - using SrcGridT = nanovdb::build::FloatGrid; -#endif - SrcGridT srcGrid(0.0f);// create an empty source grid - auto srcAcc = srcGrid.getAccessor();// create an accessor - srcAcc.setValue(nanovdb::Coord(1,2,3), 1.0f);// set a voxel value - - auto handle = nanovdb::createNanoGrid(srcGrid);// convert source grid to a grid handle - auto dstGrid = handle.grid();// get a pointer to the destination grid - \endcode - - \brief Convert a base-pointer to an openvdb grid, denoted srcGrid, to a nanovdb - grid of the same type, e.g. float -> float or openvdb::Vec3f -> nanovdb::Vec3f - \code - auto handle = nanovdb::openToNanoVDB(*srcGrid);// convert source grid to a grid handle - auto dstGrid = handle.grid();// get a pointer to the destination grid - \endcode - - \brief Converts any existing grid to a NanoVDB grid, for example: - nanovdb::build::Grid -> nanovdb::Grid - nanovdb::Grid -> nanovdb::Grid - nanovdb::Grid -> nanovdb::Grid - openvdb::Grid -> nanovdb::Grid - openvdb::Grid -> nanovdb::Grid - openvdb::Grid -> nanovdb::Grid - openvdb::Grid -> nanovdb::Grid - - \note This files replaces GridBuilder.h, IndexGridBuilder.h and OpenToNanoVDB.h -*/ - -#ifndef NANOVDB_CREATE_NANOGRID_H_HAS_BEEN_INCLUDED -#define NANOVDB_CREATE_NANOGRID_H_HAS_BEEN_INCLUDED - -#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) -#include -#include -#include -#endif - -#include "GridBuilder.h" -#include "NodeManager.h" -#include "GridHandle.h" -#include "GridStats.h" -#include "GridChecksum.h" -#include "Range.h" -#include "Invoke.h" -#include "ForEach.h" -#include "Reduce.h" -#include "PrefixSum.h" -#include "DitherLUT.h"// for nanovdb::DitherLUT - -#include -#include -#include -#include // for memcpy -#include - -namespace nanovdb { - -// Forward declarations (defined below) -template class CreateNanoGrid; -class AbsDiff; -template struct MapToNano; - -//================================================================================================ - -#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) -/// @brief Forward declaration of free-standing function that converts an OpenVDB GridBase into a NanoVDB GridHandle -/// @tparam BufferT Type of the buffer used to allocate the destination grid -/// @param base Shared pointer to a base openvdb grid to be converted -/// @param sMode Mode for computing statistics of the destination grid -/// @param cMode Mode for computing checksums of the destination grid -/// @param verbose Mode of verbosity -/// @return Handle to the destination NanoGrid -template -GridHandle -openToNanoVDB(const openvdb::GridBase::Ptr& base, - 
StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - int verbose = 0); -#endif - -//================================================================================================ - -/// @brief Freestanding function that creates a NanoGrid from any source grid -/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid -/// @tparam DstBuildT Type of values in the output (destination) nanovdb Grid, e.g. float or nanovdb::Fp16 -/// @tparam BufferT Type of the buffer used ti allocate the destination grid -/// @param srcGrid Input (source) grid to be converted -/// @param sMode Mode for computing statistics of the destination grid -/// @param cMode Mode for computing checksums of the destination grid -/// @param verbose Mode of verbosity -/// @param buffer Instance of a buffer used for allocation -/// @return Handle to the destination NanoGrid -template::type, - typename BufferT = HostBuffer> -typename disable_if::is_index || BuildTraits::is_Fp, GridHandle>::type -createNanoGrid(const SrcGridT &srcGrid, - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - int verbose = 0, - const BufferT &buffer = BufferT()); - -//================================================================================================ - -/// @brief Freestanding function that creates a NanoGrid or NanoGrid from any source grid -/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid -/// @tparam DstBuildT If ValueIndex all (active and inactive) values are indexed and if -/// it is ValueOnIndex only active values are indexed. -/// @tparam BufferT BufferT Type of the buffer used ti allocate the destination grid -/// @param channels If non-zero the values (active or all) in @c srcGrid are encoded as blind -/// data in the output index grid. @c channels indicates the number of copies -/// of these blind data -/// @param includeStats If true all tree nodes will includes indices for stats, i.e. min/max/avg/std-div -/// @param includeTiles If false on values in leaf nodes are indexed -/// @param verbose Mode of verbosity -/// @param buffer Instance of a buffer used for allocation -/// @return Handle to the destination NanoGrid where T = ValueIndex or ValueOnIndex -template::type, - typename BufferT = HostBuffer> -typename enable_if::is_index, GridHandle>::type -createNanoGrid(const SrcGridT &srcGrid, - uint32_t channels = 0u, - bool includeStats = true, - bool includeTiles = true, - int verbose = 0, - const BufferT &buffer = BufferT()); - -//================================================================================================ - -/// @brief Freestanding function to create a NanoGrid from any source grid -/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid -/// @tparam DstBuildT = FpN, i.e. variable bit-width of the output grid -/// @tparam OracleT Type of the oracle used to determine the local bit-width, i.e. N in FpN -/// @tparam BufferT Type of the buffer used to allocate the destination grid -/// @param srcGrid Input (source) grid to be converted -/// @param ditherOn switch to enable or disable dithering of quantization error -/// @param sMode Mode for computing statistics of the destination grid -/// @param cMode Mode for computing checksums of the destination grid -/// @param verbose Mode of verbosity -/// @param oracle Instance of a oracle used to determine the local bit-width, i.e. 
-
-/// @brief Freestanding function to create a NanoGrid<FpN> from any source grid
-/// @tparam SrcGridT Type of the input (source) grid, e.g. openvdb::Grid or nanovdb::Grid
-/// @tparam DstBuildT = FpN, i.e. variable bit-width of the output grid
-/// @tparam OracleT Type of the oracle used to determine the local bit-width, i.e. N in FpN
-/// @tparam BufferT Type of the buffer used to allocate the destination grid
-/// @param srcGrid Input (source) grid to be converted
-/// @param ditherOn switch to enable or disable dithering of quantization error
-/// @param sMode Mode for computing statistics of the destination grid
-/// @param cMode Mode for computing checksums of the destination grid
-/// @param verbose Mode of verbosity
-/// @param oracle Instance of an oracle used to determine the local bit-width, i.e. N in FpN
-/// @param buffer Instance of a buffer used for allocation
-/// @return Handle to the destination NanoGrid
-template<typename SrcGridT,
-         typename DstBuildT = typename MapToNano<typename SrcGridT::BuildType>::type,
-         typename OracleT = AbsDiff,
-         typename BufferT = HostBuffer>
-typename enable_if<is_same<FpN, DstBuildT>::value, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               StatsMode sMode = StatsMode::Default,
-               ChecksumMode cMode = ChecksumMode::Default,
-               bool ditherOn = false,
-               int verbose = 0,
-               const OracleT &oracle = OracleT(),
-               const BufferT &buffer = BufferT());
-
-//================================================================================================
-
-/// @brief Freestanding function to create a NanoGrid<FpX> from any source grid, X=4,8,16
-/// @tparam SrcGridT Type of the input (source) grid, e.g. openvdb::Grid or nanovdb::Grid
-/// @tparam DstBuildT = Fp4, Fp8 or Fp16, i.e. quantization bit-width of the output grid
-/// @tparam BufferT Type of the buffer used to allocate the destination grid
-/// @param srcGrid Input (source) grid to be converted
-/// @param ditherOn switch to enable or disable dithering of quantization error
-/// @param sMode Mode for computing statistics of the destination grid
-/// @param cMode Mode for computing checksums of the destination grid
-/// @param verbose Mode of verbosity
-/// @param buffer Instance of a buffer used for allocation
-/// @return Handle to the destination NanoGrid
-template<typename SrcGridT,
-         typename DstBuildT = typename MapToNano<typename SrcGridT::BuildType>::type,
-         typename BufferT = HostBuffer>
-typename enable_if<BuildTraits<DstBuildT>::is_FpX, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               StatsMode sMode = StatsMode::Default,
-               ChecksumMode cMode = ChecksumMode::Default,
-               bool ditherOn = false,
-               int verbose = 0,
-               const BufferT &buffer = BufferT());
-
-//================================================================================================
-
-/// @brief Compression oracle based on absolute difference
-class AbsDiff
-{
-    float mTolerance;// absolute error tolerance
-public:
-    /// @note The default value of -1 means it's un-initialized!
-    AbsDiff(float tolerance = -1.0f) : mTolerance(tolerance) {}
-    AbsDiff(const AbsDiff&) = default;
-    ~AbsDiff() = default;
-    operator bool() const {return mTolerance>=0.0f;}
-    void init(nanovdb::GridClass gClass, float background) {
-        if (gClass == GridClass::LevelSet) {
-            static const float halfWidth = 3.0f;
-            mTolerance = 0.1f * background / halfWidth;// range of ls: [-3dx; 3dx]
-        } else if (gClass == GridClass::FogVolume) {
-            mTolerance = 0.01f;// range of FOG volumes: [0;1]
-        } else {
-            mTolerance = 0.0f;
-        }
-    }
-    void setTolerance(float tolerance) { mTolerance = tolerance; }
-    float getTolerance() const { return mTolerance; }
-    /// @brief Return true if the approximate value is within the accepted
-    ///        absolute error bounds of the exact value.
-    ///
-    /// @details Required member method
-    bool operator()(float exact, float approx) const
-    {
-        return Abs(exact - approx) <= mTolerance;
-    }
-};// AbsDiff
-
-inline std::ostream& operator<<(std::ostream& os, const AbsDiff& diff)
-{
-    os << "Absolute tolerance: " << diff.getTolerance();
-    return os;
-}
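For illustration, a sketch that pairs the AbsDiff oracle above with the FpN overload of createNanoGrid() (assuming an existing openvdb::FloatGrid named srcGrid; the tolerance is an arbitrary example value):

\code
nanovdb::AbsDiff oracle(0.005f);// accept at most 0.005 absolute error per voxel
auto handle = nanovdb::createNanoGrid<openvdb::FloatGrid, nanovdb::FpN>(
    srcGrid,
    nanovdb::StatsMode::Default,
    nanovdb::ChecksumMode::Default,
    /*ditherOn=*/true,// randomize the quantization error
    /*verbose=*/0,
    oracle);
auto *dstGrid = handle.grid<nanovdb::FpN>();// variable bit-rate grid
\endcode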
-
-//================================================================================================
-
-/// @brief Compression oracle based on relative difference
-class RelDiff
-{
-    float mTolerance;// relative error tolerance
-public:
-    /// @note The default value of -1 means it's un-initialized!
-    RelDiff(float tolerance = -1.0f) : mTolerance(tolerance) {}
-    RelDiff(const RelDiff&) = default;
-    ~RelDiff() = default;
-    operator bool() const {return mTolerance>=0.0f;}
-    void setTolerance(float tolerance) { mTolerance = tolerance; }
-    float getTolerance() const { return mTolerance; }
-    /// @brief Return true if the approximate value is within the accepted
-    ///        relative error bounds of the exact value.
-    ///
-    /// @details Required member method
-    bool operator()(float exact, float approx) const
-    {
-        return Abs(exact - approx)/Max(Abs(exact), Abs(approx)) <= mTolerance;
-    }
-};// RelDiff
-
-inline std::ostream& operator<<(std::ostream& os, const RelDiff& diff)
-{
-    os << "Relative tolerance: " << diff.getTolerance();
-    return os;
-}
-
-//================================================================================================
-
-/// @brief The NodeAccessor provides a uniform API for accessing nodes of NanoVDB, OpenVDB and build Grids
-///
-/// @note General implementation that works with nanovdb::build::Grid
-template <typename GridT>
-class NodeAccessor
-{
-public:
-    static constexpr bool IS_OPENVDB = false;
-    static constexpr bool IS_NANOVDB = false;
-    using BuildType = typename GridT::BuildType;
-    using ValueType = typename GridT::ValueType;
-    using GridType  = GridT;
-    using TreeType  = typename GridT::TreeType;
-    using RootType  = typename TreeType::RootNodeType;
-    template <int LEVEL>
-    using NodeType = typename NodeTrait<const TreeType, LEVEL>::type;
-    NodeAccessor(const GridT &grid) : mMgr(const_cast<GridT&>(grid)) {}
-    const GridType& grid() const {return mMgr.grid();}
-    const TreeType& tree() const {return mMgr.tree();}
-    const RootType& root() const {return mMgr.root();}
-    uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); }
-    template <int LEVEL>
-    const NodeType<LEVEL>& node(uint32_t i) const {return mMgr.template node<LEVEL>(i); }
-    const std::string& getName() const {return this->grid().getName();};
-    bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;}
-    const nanovdb::Map& map() const {return this->grid().map();}
-    GridClass gridClass() const {return this->grid().gridClass();}
-private:
-    build::NodeManager<GridT> mMgr;
-};// NodeAccessor
-
-//================================================================================================
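For illustration, a sketch of the uniform NodeAccessor API defined above (assuming a populated nanovdb::build::FloatGrid named srcGrid; the loop body is a placeholder):

\code
nanovdb::NodeAccessor<nanovdb::build::FloatGrid> acc(srcGrid);
for (uint64_t i = 0, n = acc.nodeCount(0); i < n; ++i) {
    const auto &leaf = acc.node<0>(i);// i'th leaf node, same call for all grid types
    // ... e.g. inspect leaf.origin() ...
}
\endcode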
-
-/// @brief Template specialization for nanovdb::Grid which is special since its NodeManager
-///        uses a handle in order to support node access on the GPU!
-template <typename BuildT>
-class NodeAccessor< NanoGrid<BuildT> >
-{
-public:
-    static constexpr bool IS_OPENVDB = false;
-    static constexpr bool IS_NANOVDB = true;
-    using BuildType  = BuildT;
-    using BufferType = HostBuffer;
-    using GridType   = NanoGrid<BuildT>;
-    using ValueType  = typename GridType::ValueType;
-    using TreeType   = typename GridType::TreeType;
-    using RootType   = typename TreeType::RootType;
-    template <int LEVEL>
-    using NodeType = typename NodeTrait<const TreeType, LEVEL>::type;
-    NodeAccessor(const GridType &grid)
-        : mHandle(createNodeManager(grid))
-        , mMgr(*(mHandle.template mgr<BuildT>())) {}
-    const GridType& grid() const {return mMgr.grid();}
-    const TreeType& tree() const {return mMgr.tree();}
-    const RootType& root() const {return mMgr.root();}
-    uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); }
-    template <int LEVEL>
-    const NodeType<LEVEL>& node(uint32_t i) const {return mMgr.template node<LEVEL>(i); }
-    std::string getName() const {return std::string(this->grid().gridName());};
-    bool hasLongGridName() const {return this->grid().hasLongGridName();}
-    const nanovdb::Map& map() const {return this->grid().map();}
-    GridClass gridClass() const {return this->grid().gridClass();}
-private:
-    NodeManagerHandle<BufferType> mHandle;
-    const NodeManager<BuildT>    &mMgr;
-};// NodeAccessor<NanoGrid<BuildT>>
-
-//================================================================================================
-
-/// @brief Trait that maps any type to the corresponding nanovdb type
-/// @tparam T Type to be mapped
-template<typename T>
-struct MapToNano { using type = T; };
-
-#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
-
-template<>
-struct MapToNano<openvdb::ValueMask> {using type = nanovdb::ValueMask;};
-template<typename T>
-struct MapToNano<openvdb::math::Vec3<T>>{using type = nanovdb::Vec3<T>;};
-template<typename T>
-struct MapToNano<openvdb::math::Vec4<T>>{using type = nanovdb::Vec4<T>;};
-template<>
-struct MapToNano<openvdb::PointIndex32> {using type = uint32_t;};
-template<>
-struct MapToNano<openvdb::PointDataIndex32> {using type = uint32_t;};
-
-/// Templated Grid with default 32->16->8 configuration
-template <typename BuildT>
-using OpenLeaf = openvdb::tree::LeafNode<BuildT,3>;
-template <typename BuildT>
-using OpenLower = openvdb::tree::InternalNode<OpenLeaf<BuildT>,4>;
-template <typename BuildT>
-using OpenUpper = openvdb::tree::InternalNode<OpenLower<BuildT>,5>;
-template <typename BuildT>
-using OpenRoot = openvdb::tree::RootNode<OpenUpper<BuildT>>;
-template <typename BuildT>
-using OpenTree = openvdb::tree::Tree<OpenRoot<BuildT>>;
-template <typename BuildT>
-using OpenGrid = openvdb::Grid<OpenTree<BuildT>>;
-
-//================================================================================================
-
-/// @brief Template specialization for openvdb::Grid
-template <typename BuildT>
-class NodeAccessor<OpenGrid<BuildT>>
-{
-public:
-    static constexpr bool IS_OPENVDB = true;
-    static constexpr bool IS_NANOVDB = false;
-    using BuildType = BuildT;
-    using GridType  = OpenGrid<BuildT>;
-    using ValueType = typename GridType::ValueType;
-    using TreeType  = OpenTree<BuildT>;
-    using RootType  = OpenRoot<BuildT>;
-    template <int LEVEL>
-    using NodeType = typename NodeTrait<const TreeType, LEVEL>::type;
-    NodeAccessor(const GridType &grid) : mMgr(const_cast<GridType&>(grid)) {
-        const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4();
-        mMap.set(mat4, mat4.inverse());
-    }
-    const GridType& grid() const {return mMgr.grid();}
-    const TreeType& tree() const {return mMgr.tree();}
-    const RootType& root() const {return mMgr.root();}
-    uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); }
-    template <int LEVEL>
-    const NodeType<LEVEL>& node(uint32_t i) const {return mMgr.template node<LEVEL>(i); }
-    std::string getName() const { return this->grid().getName(); };
-    bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;}
-    const nanovdb::Map& map() const {return mMap;}
-    GridClass gridClass() const {
-        switch (this->grid().getGridClass()) {
-            case openvdb::GRID_LEVEL_SET:
-                if
(!is_floating_point::value) OPENVDB_THROW(openvdb::ValueError, "processGrid: Level sets are expected to be floating point types"); - return GridClass::LevelSet; - case openvdb::GRID_FOG_VOLUME: - return GridClass::FogVolume; - case openvdb::GRID_STAGGERED: - return GridClass::Staggered; - default: - return GridClass::Unknown; - } - } -private: - build::NodeManager mMgr; - nanovdb::Map mMap; -};// NodeAccessor> - -//================================================================================================ - -/// @brief Template specialization for openvdb::tools::PointIndexGrid -template <> -class NodeAccessor -{ -public: - static constexpr bool IS_OPENVDB = true; - static constexpr bool IS_NANOVDB = false; - using BuildType = openvdb::PointIndex32; - using GridType = openvdb::tools::PointIndexGrid; - using TreeType = openvdb::tools::PointIndexTree; - using RootType = typename TreeType::RootNodeType; - using ValueType = typename GridType::ValueType; - template - using NodeType = typename NodeTrait::type; - NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { - const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); - mMap.set(mat4, mat4.inverse()); - } - const GridType& grid() const {return mMgr.grid();} - const TreeType& tree() const {return mMgr.tree();} - const RootType& root() const {return mMgr.root();} - uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } - template - const NodeType& node(uint32_t i) const {return mMgr.template node(i); } - std::string getName() const { return this->grid().getName(); }; - bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} - const nanovdb::Map& map() const {return mMap;} - GridClass gridClass() const {return GridClass::PointIndex;} -private: - build::NodeManager mMgr; - nanovdb::Map mMap; -};// NodeAccessor - -//================================================================================================ - -// @brief Template specialization for openvdb::points::PointDataGrid -template <> -class NodeAccessor -{ -public: - static constexpr bool IS_OPENVDB = true; - static constexpr bool IS_NANOVDB = false; - using BuildType = openvdb::PointDataIndex32; - using GridType = openvdb::points::PointDataGrid; - using TreeType = openvdb::points::PointDataTree; - using RootType = typename TreeType::RootNodeType; - using ValueType = typename GridType::ValueType; - template - using NodeType = typename NodeTrait::type; - NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { - const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); - mMap.set(mat4, mat4.inverse()); - } - const GridType& grid() const {return mMgr.grid();} - const TreeType& tree() const {return mMgr.tree();} - const RootType& root() const {return mMgr.root();} - uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } - template - const NodeType& node(uint32_t i) const {return mMgr.template node(i); } - std::string getName() const { return this->grid().getName(); }; - bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} - const nanovdb::Map& map() const {return mMap;} - GridClass gridClass() const {return GridClass::PointData;} -private: - build::NodeManager mMgr; - nanovdb::Map mMap; -};// NodeAccessor - -#endif// NANOVDB_USE_OPENVDB - -//================================================================================================ - -/// @brief Creates any nanovdb Grid from any source grid (certain 
combinations are obviously not allowed)
-template <typename SrcGridT>
-class CreateNanoGrid
-{
-public:
-    // SrcGridT can be either openvdb::Grid, nanovdb::Grid or nanovdb::build::Grid
-    using SrcNodeAccT = NodeAccessor<SrcGridT>;
-    using SrcBuildT   = typename SrcNodeAccT::BuildType;
-    using SrcValueT   = typename SrcNodeAccT::ValueType;
-    using SrcTreeT    = typename SrcNodeAccT::TreeType;
-    using SrcRootT    = typename SrcNodeAccT::RootType;
-    template <int LEVEL>
-    using SrcNodeT = typename NodeTrait<SrcRootT, LEVEL>::type;
-
-    /// @brief Constructor from a source grid
-    /// @param srcGrid Source grid of type SrcGridT
-    CreateNanoGrid(const SrcGridT &srcGrid);
-
-    /// @brief Constructor from a source node accessor (defined above)
-    /// @param srcNodeAcc Source node accessor of type SrcNodeAccT
-    CreateNanoGrid(const SrcNodeAccT &srcNodeAcc);
-
-    /// @brief Set the level of verbosity
-    /// @param mode level of verbosity, mode=0 means quiet
-    void setVerbose(int mode = 1) { mVerbose = mode; }
-
-    /// @brief Enable or disable dithering, i.e. randomization of the quantization error.
-    /// @param on enable or disable dithering
-    /// @warning Dithering only has an effect when DstBuildT = {Fp4, Fp8, Fp16, FpN}
-    void enableDithering(bool on = true) { mDitherOn = on; }
-
-    /// @brief Set the mode used for computing statistics of the destination grid
-    /// @param mode specify the mode of statistics
-    void setStats(StatsMode mode = StatsMode::Default) { mStats = mode; }
-
-    /// @brief Set the mode used for computing checksums of the destination grid
-    /// @param mode specify the mode of checksum
-    void setChecksum(ChecksumMode mode = ChecksumMode::Default) { mChecksum = mode; }
-
-    /// @brief Converts the source grid into a nanovdb grid with the specified destination build type
-    /// @tparam DstBuildT build type of the destination, output, grid
-    /// @tparam BufferT Type of the buffer used for allocating the destination grid
-    /// @param buffer instance of the buffer used for allocation
-    /// @return Return an instance of a GridHandle (invoking move semantics)
-    /// @note This version is when DstBuildT != {FpN, ValueIndex, ValueOnIndex}
-    template<typename DstBuildT = typename MapToNano<SrcBuildT>::type, typename BufferT = HostBuffer>
-    typename disable_if<is_same<FpN, DstBuildT>::value ||
-                        BuildTraits<DstBuildT>::is_index, GridHandle<BufferT>>::type
-    getHandle(const BufferT &buffer = BufferT());
-
-    /// @brief Converts the source grid into a nanovdb grid with variable bit quantization
-    /// @tparam DstBuildT FpN, i.e. the destination grid uses variable bit quantization
-    /// @tparam OracleT Type of oracle used to determine the N in FpN
-    /// @tparam BufferT Type of the buffer used for allocating the destination grid
-    /// @param oracle Instance of the oracle used to determine the N in FpN
-    /// @param buffer instance of the buffer used for allocation
-    /// @return Return an instance of a GridHandle (invoking move semantics)
-    /// @note This version assumes DstBuildT == FpN
-    template<typename DstBuildT = typename MapToNano<SrcBuildT>::type, typename OracleT = AbsDiff, typename BufferT = HostBuffer>
-    typename enable_if<is_same<FpN, DstBuildT>::value, GridHandle<BufferT>>::type
-    getHandle(const OracleT &oracle = OracleT(),
-              const BufferT &buffer = BufferT());
-
-    /// @brief Converts the source grid into a nanovdb grid with indices to external arrays of values
-    /// @tparam DstBuildT ValueIndex or ValueOnIndex, i.e. index all or just active values
-    /// @tparam BufferT Type of the buffer used for allocating the destination grid
-    /// @param channels Number of copies of values encoded as blind data in the destination grid
-    /// @param includeStats Specify if statistics should be indexed
-    /// @param includeTiles Specify if tile values, i.e. non-leaf-node-values, should be indexed
-    /// @param buffer instance of the buffer used for allocation
-    /// @return Return an instance of a GridHandle (invoking move semantics)
-    template<typename DstBuildT = typename MapToNano<SrcBuildT>::type, typename BufferT = HostBuffer>
-    typename enable_if<BuildTraits<DstBuildT>::is_index, GridHandle<BufferT>>::type
-    getHandle(uint32_t channels = 0u,
-              bool includeStats = true,
-              bool includeTiles = true,
-              const BufferT &buffer = BufferT());
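For illustration, a sketch that drives the CreateNanoGrid class above directly instead of the freestanding functions (assuming an existing openvdb::FloatGrid named srcGrid):

\code
nanovdb::CreateNanoGrid<openvdb::FloatGrid> converter(srcGrid);
converter.setStats(nanovdb::StatsMode::All);// full statistics: min/max/avg/std-dev
converter.setChecksum(nanovdb::ChecksumMode::Full);
converter.enableDithering();// affects the Fp16 quantization below
auto handle = converter.getHandle<nanovdb::Fp16>();// float -> 16-bit quantized values
auto *dstGrid = handle.grid<nanovdb::Fp16>();
\endcode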
-
-    /// @brief Add blind data to the destination grid
-    /// @param name String name of the blind data
-    /// @param dataSemantic Semantics of the blind data
-    /// @param dataClass Class of the blind data
-    /// @param dataType Type of the blind data
-    /// @param count Element count of the blind data
-    /// @param size Size of each element of the blind data
-    /// @return Return the index used to access the blind data
-    uint64_t addBlindData(const std::string& name,
-                          GridBlindDataSemantic dataSemantic,
-                          GridBlindDataClass dataClass,
-                          GridType dataType,
-                          size_t count, size_t size)
-    {
-        const size_t order = mBlindMetaData.size();
-        mBlindMetaData.emplace(name, dataSemantic, dataClass, dataType, order, count, size);
-        return order;
-    }
-
-    /// @brief This method only has an effect when getHandle was called with DstBuildT = ValueIndex or ValueOnIndex
-    /// @return Return the number of indexed values. If called before getHandle was called with
-    ///         DstBuildT = ValueIndex or ValueOnIndex the return value is zero. Else it is a value larger than zero.
-    uint64_t valueCount() const {return mValIdx[0].empty() ? 0u : mValIdx[0].back();}
-
-    /// @brief Copy values from the source grid into a provided buffer
-    /// @tparam DstBuildT Must be ValueIndex or ValueOnIndex, i.e. an index grid
-    /// @param buffer pointer in which to write values
-    template <typename DstBuildT>
-    typename enable_if<BuildTraits<DstBuildT>::is_index>::type
-    copyValues(SrcValueT *buffer);
-
-private:
-
-    // =========================================================
-
-    template <typename DstBuildT, int LEVEL>
-    typename enable_if<!(is_same<FpN, DstBuildT>::value&&LEVEL==0), typename NodeTrait<NanoRoot<DstBuildT>, LEVEL>::type*>::type
-    dstNode(uint64_t i) const {
-        static_assert(LEVEL==0 || LEVEL==1 || LEVEL==2, "Expected LEVEL== {0,1,2}");
-        using NodeT = typename NodeTrait<NanoRoot<DstBuildT>, LEVEL>::type;
-        return PtrAdd<NodeT>(mBufferPtr, mOffset[5-LEVEL]) + i;
-    }
-    template <typename DstBuildT, int LEVEL>
-    typename enable_if<is_same<FpN, DstBuildT>::value && LEVEL==0, NanoLeaf<FpN>*>::type
-    dstNode(uint64_t i) const {return PtrAdd<NanoLeaf<FpN>>(mBufferPtr, mCodec[i].offset);}
-
-    template <typename DstBuildT> NanoRoot<DstBuildT>* dstRoot() const {return PtrAdd<NanoRoot<DstBuildT>>(mBufferPtr, mOffset.root);}
-    template <typename DstBuildT> NanoTree<DstBuildT>* dstTree() const {return PtrAdd<NanoTree<DstBuildT>>(mBufferPtr, mOffset.tree);}
-    template <typename DstBuildT> NanoGrid<DstBuildT>* dstGrid() const {return PtrAdd<NanoGrid<DstBuildT>>(mBufferPtr, mOffset.grid);}
-    GridBlindMetaData* dstMeta(uint32_t i) const { return PtrAdd<GridBlindMetaData>(mBufferPtr, mOffset.meta) + i;};
-
-    // =========================================================
-
-    template <typename DstBuildT>
-    typename disable_if<is_same<FpN, DstBuildT>::value || BuildTraits<DstBuildT>::is_index>::type
-    preProcess();
-
-    template <typename DstBuildT>
-    typename enable_if<BuildTraits<DstBuildT>::is_index>::type
-    preProcess(uint32_t channels);
-
-    template <typename DstBuildT, typename OracleT>
-    typename enable_if<is_same<FpN, DstBuildT>::value>::type
-    preProcess(OracleT oracle);
-
-    // =========================================================
-
-    // Below are private methods used to serialize nodes into NanoVDB
-    template <typename DstBuildT, typename BufferT>
-    GridHandle<BufferT> initHandle(const BufferT& buffer);
-
-    // =========================================================
-
-    template <typename DstBuildT>
-    inline typename enable_if<BuildTraits<DstBuildT>::is_index>::type
-    postProcess(uint32_t channels);
-
-    template <typename DstBuildT>
-    inline typename disable_if<BuildTraits<DstBuildT>::is_index>::type
-    postProcess();
-
-    // ========================================================
-
-    template <typename DstBuildT>
-    typename
disable_if::is_special>::type - processLeafs(); - - template - typename enable_if::is_index>::type - processLeafs(); - - template - typename enable_if::is_FpX>::type - processLeafs(); - - template - typename enable_if::value>::type - processLeafs(); - - template - typename enable_if::value>::type - processLeafs(); - - template - typename enable_if::value>::type - processLeafs(); - - // ========================================================= - - template - typename enable_if::is_index>::type - processInternalNodes(); - - template - typename enable_if::is_index>::type - processInternalNodes(); - - // ========================================================= - - template - typename enable_if::is_index>::type - processRoot(); - - template - typename enable_if::is_index>::type - processRoot(); - - // ========================================================= - - template - void processTree(); - - template - void processGrid(); - - template - typename enable_if::is_index, uint64_t>::type - countTileValues(uint64_t valueCount); - - template - typename enable_if::is_index, uint64_t>::type - countValues(); - -#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) - template - typename disable_if::value || - is_same::value, uint64_t>::type - countPoints() const; - - template - typename enable_if::value || - is_same::value, uint64_t>::type - countPoints() const; - - template - typename enable_if::value>::type - copyPointAttribute(size_t attIdx, AttT *attPtr); -#else - uint64_t countPoints() const {return 0u;} -#endif - - uint8_t* mBufferPtr;// pointer to the beginning of the destination nanovdb grid buffer - struct BufferOffsets { - uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size; - uint64_t operator[](int i) const { return *(reinterpret_cast(this)+i); } - } mOffset; - int mVerbose; - uint64_t mLeafNodeSize;// non-trivial when DstBuiltT = FpN - - std::unique_ptr mSrcNodeAccPtr;// placeholder for potential local instance - const SrcNodeAccT &mSrcNodeAcc; - struct BlindMetaData; // forward declaration - std::set mBlindMetaData; // sorted according to BlindMetaData.order - struct Codec { float min, max; uint64_t offset; uint8_t log2; };// used for adaptive bit-rate quantization - std::unique_ptr mCodec;// defines a codec per leaf node when DstBuildT = FpN - StatsMode mStats; - ChecksumMode mChecksum; - bool mDitherOn, mIncludeStats, mIncludeTiles; - std::vector mValIdx[3];// store id of first value in node -}; // CreateNanoGrid - -//================================================================================================ - -template -CreateNanoGrid::CreateNanoGrid(const SrcGridT &srcGrid) - : mVerbose(0) - , mSrcNodeAccPtr(new SrcNodeAccT(srcGrid)) - , mSrcNodeAcc(*mSrcNodeAccPtr) - , mStats(StatsMode::Default) - , mChecksum(ChecksumMode::Default) - , mDitherOn(false) - , mIncludeStats(true) - , mIncludeTiles(true) -{ -} - -//================================================================================================ - -template -CreateNanoGrid::CreateNanoGrid(const SrcNodeAccT &srcNodeAcc) - : mVerbose(0) - , mSrcNodeAccPtr(nullptr) - , mSrcNodeAcc(srcNodeAcc) - , mStats(StatsMode::Default) - , mChecksum(ChecksumMode::Default) - , mDitherOn(false) - , mIncludeStats(true) - , mIncludeTiles(true) -{ -} - -//================================================================================================ - -template -struct CreateNanoGrid::BlindMetaData -{ - BlindMetaData(const std::string& name,// name + used to derive GridBlindDataSemantic - const std::string& type,// used 
to derive GridType of blind data - GridBlindDataClass dataClass, - size_t i, size_t valueCount, size_t valueSize) - : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) - , order(i)// sorted id of meta data - , size(AlignUp(valueCount * valueSize)) - { - std::memset(metaData, 0, sizeof(GridBlindMetaData));// zero out all meta data - if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds limit"); - std::memcpy(metaData->mName, name.c_str(), name.length() + 1); - metaData->mValueCount = valueCount; - metaData->mSemantic = BlindMetaData::mapToSemantics(name); - metaData->mDataClass = dataClass; - metaData->mDataType = BlindMetaData::mapToType(type); - metaData->mValueSize = valueSize; - NANOVDB_ASSERT(metaData->isValid()); - } - BlindMetaData(const std::string& name,// only name - GridBlindDataSemantic dataSemantic, - GridBlindDataClass dataClass, - GridType dataType, - size_t i, size_t valueCount, size_t valueSize) - : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) - , order(i)// sorted id of meta data - , size(AlignUp(valueCount * valueSize)) - { - std::memset(metaData, 0, sizeof(GridBlindMetaData));// zero out all meta data - if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds character limit"); - std::memcpy(metaData->mName, name.c_str(), name.length() + 1); - metaData->mValueCount = valueCount; - metaData->mSemantic = dataSemantic; - metaData->mDataClass = dataClass; - metaData->mDataType = dataType; - metaData->mValueSize = valueSize; - NANOVDB_ASSERT(metaData->isValid()); - } - ~BlindMetaData(){ delete [] reinterpret_cast(metaData); } - bool operator<(const BlindMetaData& other) const { return order < other.order; } // required by std::set - static GridType mapToType(const std::string& name) - { - GridType type = GridType::Unknown; - if ("uint32_t" == name) { - type = GridType::UInt32; - } else if ("float" == name) { - type = GridType::Float; - } else if ("vec3s"== name) { - type = GridType::Vec3f; - } else if ("int32" == name) { - type = GridType::Int32; - } else if ("int64" == name) { - type = GridType::Int64; - } - return type; - } - static GridBlindDataSemantic mapToSemantics(const std::string& name) - { - GridBlindDataSemantic semantic = GridBlindDataSemantic::Unknown; - if ("P" == name) { - semantic = GridBlindDataSemantic::PointPosition; - } else if ("V" == name) { - semantic = GridBlindDataSemantic::PointVelocity; - } else if ("Cd" == name) { - semantic = GridBlindDataSemantic::PointColor; - } else if ("N" == name) { - semantic = GridBlindDataSemantic::PointNormal; - } else if ("id" == name) { - semantic = GridBlindDataSemantic::PointId; - } - return semantic; - } - GridBlindMetaData *metaData; - const size_t order, size; -}; // CreateNanoGrid::BlindMetaData - -//================================================================================================ - -template -template -typename disable_if::value || - BuildTraits::is_index, GridHandle>::type -CreateNanoGrid::getHandle(const BufferT& pool) -{ - this->template preProcess(); - auto handle = this->template initHandle(pool); - this->template postProcess(); - return handle; -} // CreateNanoGrid::getHandle - -//================================================================================================ - -template -template -typename enable_if::value, GridHandle>::type -CreateNanoGrid::getHandle(const OracleT& oracle, const BufferT& pool) -{ - this->template preProcess(oracle); - auto handle = this->template 
initHandle(pool); - this->template postProcess(); - return handle; -} // CreateNanoGrid::getHandle - -//================================================================================================ - -template -template -typename enable_if::is_index, GridHandle>::type -CreateNanoGrid::getHandle(uint32_t channels, - bool includeStats, - bool includeTiles, - const BufferT &pool) -{ - mIncludeStats = includeStats; - mIncludeTiles = includeTiles; - this->template preProcess(channels); - auto handle = this->template initHandle(pool); - this->template postProcess(channels); - return handle; -}// CreateNanoGrid::getHandle - -//================================================================================================ - -template -template -GridHandle CreateNanoGrid::initHandle(const BufferT& pool) -{ - mOffset.grid = 0;// grid is always stored at the start of the buffer! - mOffset.tree = NanoGrid::memUsage(); // grid ends and tree begins - mOffset.root = mOffset.tree + NanoTree::memUsage(); // tree ends and root node begins - mOffset.upper = mOffset.root + NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()); // root node ends and upper internal nodes begin - mOffset.lower = mOffset.upper + NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2); // upper internal nodes ends and lower internal nodes begin - mOffset.leaf = mOffset.lower + NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); // lower internal nodes ends and leaf nodes begin - mOffset.meta = mOffset.leaf + mLeafNodeSize;// leaf nodes end and blind meta data begins - mOffset.blind = mOffset.meta + sizeof(GridBlindMetaData)*mBlindMetaData.size(); // meta data ends and blind data begins - mOffset.size = mOffset.blind;// end of buffer - for (const auto& b : mBlindMetaData) mOffset.size += b.size; // accumulate all the blind data - - auto buffer = BufferT::create(mOffset.size, &pool); - mBufferPtr = buffer.data(); - - // Concurrent processing of all tree levels! - invoke( [&](){this->template processLeafs();}, - [&](){this->template processInternalNodes();}, - [&](){this->template processInternalNodes();}, - [&](){this->template processRoot();}, - [&](){this->template processTree();}, - [&](){this->template processGrid();} ); - - return GridHandle(std::move(buffer)); -} // CreateNanoGrid::initHandle - -//================================================================================================ - -template -template -inline typename disable_if::value || BuildTraits::is_index>::type -CreateNanoGrid::preProcess() -{ - if (const uint64_t pointCount = this->countPoints()) { -#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) - if constexpr(is_same::value) { - if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); - this->addBlindData("index", - GridBlindDataSemantic::PointId, - GridBlindDataClass::IndexArray, - GridType::UInt32, - pointCount, - sizeof(uint32_t)); - } else if constexpr(is_same::value) { - if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); - auto &srcLeaf = mSrcNodeAcc.template node<0>(0); - const auto& attributeSet = srcLeaf.attributeSet(); - const auto& descriptor = attributeSet.descriptor(); - const auto& nameMap = descriptor.map(); - for (auto it = nameMap.begin(); it != nameMap.end(); ++it) { - const size_t index = it->second; - auto& attArray = srcLeaf.constAttributeArray(index); - mBlindMetaData.emplace(it->first, // name used to derive semantics - descriptor.valueType(index), // type - it->first == "id" ? 
GridBlindDataClass::IndexArray : GridBlindDataClass::AttributeArray, // class - index, // order - pointCount, // element count - attArray.valueTypeSize()); // element size - } - } -#endif// end NANOVDB_USE_OPENVDB - } - if (mSrcNodeAcc.hasLongGridName()) { - this->addBlindData("grid name", - GridBlindDataSemantic::Unknown, - GridBlindDataClass::GridName, - GridType::Unknown, - mSrcNodeAcc.getName().length() + 1, 1); - } - mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); -}// CreateNanoGrid::preProcess - -//================================================================================================ - -template -template -inline typename enable_if::value>::type -CreateNanoGrid::preProcess(OracleT oracle) -{ - static_assert(is_same::value, "preProcess: expected SrcValueT == float"); - - const size_t leafCount = mSrcNodeAcc.nodeCount(0); - if (leafCount==0) { - mLeafNodeSize = 0u; - return; - } - mCodec.reset(new Codec[leafCount]); - - if constexpr(is_same::value) { - if (!oracle) oracle.init(mSrcNodeAcc.gridClass(), mSrcNodeAcc.root().background()); - } - - DitherLUT lut(mDitherOn); - forEach(0, leafCount, 4, [&](const Range1D &r) { - for (auto i=r.begin(); i!=r.end(); ++i) { - const auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - float &min = mCodec[i].min = std::numeric_limits::max(); - float &max = mCodec[i].max = -min; - for (int j=0; j<512; ++j) { - float v = srcLeaf.getValue(j); - if (vmax) max = v; - } - const float range = max - min; - uint8_t &logBitWidth = mCodec[i].log2 = 0;// 0,1,2,3,4 => 1,2,4,8,16 bits - while (range > 0.0f && logBitWidth < 4u) { - const uint32_t mask = (uint32_t(1) << (uint32_t(1) << logBitWidth)) - 1u; - const float encode = mask/range; - const float decode = range/mask; - int j = 0; - do { - const float exact = srcLeaf.getValue(j);//data[j];// exact value - const uint32_t code = uint32_t(encode*(exact - min) + lut(j)); - const float approx = code * decode + min;// approximate value - j += oracle(exact, approx) ? 1 : 513; - } while(j < 512); - if (j == 512) break; - ++logBitWidth; - } - } - }); - - auto getOffset = [&](size_t i){ - --i; - return mCodec[i].offset + NanoLeaf::DataType::memUsage(1u << mCodec[i].log2); - }; - mCodec[0].offset = NanoGrid::memUsage() + - NanoTree::memUsage() + - NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()) + - NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2) + - NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); - for (size_t i=1; iaddBlindData("grid name", - GridBlindDataSemantic::Unknown, - GridBlindDataClass::GridName, - GridType::Unknown, - mSrcNodeAcc.getName().length() + 1, 1); - } -}// CreateNanoGrid::preProcess - -//================================================================================================ - -template -template -inline typename enable_if::is_index, uint64_t>::type -CreateNanoGrid::countTileValues(uint64_t valueCount) -{ - const uint64_t stats = mIncludeStats ? 
4u : 0u;// minimum, maximum, average, and deviation - mValIdx[LEVEL].clear(); - mValIdx[LEVEL].resize(mSrcNodeAcc.nodeCount(LEVEL) + 1, stats);// minimum 1 entry - forEach(1, mValIdx[LEVEL].size(), 8, [&](const Range1D& r){ - for (auto i = r.begin(); i!=r.end(); ++i) { - auto &srcNode = mSrcNodeAcc.template node(i-1); - if constexpr(BuildTraits::is_onindex) {// resolved at compile time - mValIdx[LEVEL][i] += srcNode.getValueMask().countOn(); - } else { - static const uint64_t maxTileCount = uint64_t(1u) << 3*srcNode.LOG2DIM; - mValIdx[LEVEL][i] += maxTileCount - srcNode.getChildMask().countOn(); - } - } - }); - mValIdx[LEVEL][0] = valueCount; - for (size_t i=1; i - -//================================================================================================ - -template -template -inline typename enable_if::is_index, uint64_t>::type -CreateNanoGrid::countValues() -{ - const uint64_t stats = mIncludeStats ? 4u : 0u;// minimum, maximum, average, and deviation - uint64_t valueCount = 1u;// offset 0 corresponds to the background value - if (mIncludeTiles) { - if constexpr(BuildTraits::is_onindex) { - for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) ++valueCount; - } else { - for (auto it = mSrcNodeAcc.root().cbeginValueAll(); it; ++it) ++valueCount; - } - valueCount += stats;// optionally append stats for the root node - valueCount = countTileValues(valueCount); - valueCount = countTileValues(valueCount); - } - mValIdx[0].clear(); - mValIdx[0].resize(mSrcNodeAcc.nodeCount(0) + 1, 512u + stats);// minimum 1 entry - if constexpr(BuildTraits::is_onindex) { - forEach(1, mValIdx[0].size(), 8, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - mValIdx[0][i] = stats; - mValIdx[0][i] += mSrcNodeAcc.template node<0>(i-1).getValueMask().countOn(); - } - }); - } - mValIdx[0][0] = valueCount; - prefixSum(mValIdx[0], true);// inclusive prefix sum - return mValIdx[0].back(); -}// CreateNanoGrid::countValues() - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::preProcess(uint32_t channels) -{ - const uint64_t valueCount = this->template countValues(); - mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); - - uint32_t order = mBlindMetaData.size(); - for (uint32_t i=0; i()), - GridBlindDataClass::AttributeArray, - order++, - valueCount, - sizeof(SrcValueT)); - } - if (mSrcNodeAcc.hasLongGridName()) { - this->addBlindData("grid name", - GridBlindDataSemantic::Unknown, - GridBlindDataClass::GridName, - GridType::Unknown, - mSrcNodeAcc.getName().length() + 1, 1); - } -}// preProcess - -//================================================================================================ - -template -template -inline typename disable_if::is_special>::type -CreateNanoGrid::processLeafs() -{ - using DstDataT = typename NanoLeaf::DataType; - using DstValueT = typename DstDataT::ValueType; - static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - if (DstDataT::padding()>0u) { - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - std::memset(reinterpret_cast(dstData), 0, DstDataT::memUsage()); - } else { - dstData->mBBoxDif[0] = 
dstData->mBBoxDif[1] = dstData->mBBoxDif[2] = 0u; - dstData->mFlags = 0u;// enable rendering, no bbox, no stats - dstData->mMinimum = dstData->mMaximum = typename DstDataT::ValueType(); - dstData->mAverage = dstData->mStdDevi = 0; - } - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - DstValueT *dst = dstData->mValues; - if constexpr(is_same::value && SrcNodeAccT::IS_OPENVDB) { - const SrcValueT *src = srcLeaf.buffer().data(); - for (auto *end = dst + 512u; dst != end; dst += 4, src += 4) { - dst[0] = src[0]; // copy *all* voxel values in sets of four, i.e. loop-unrolling - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - } - } else { - for (uint32_t j=0; j<512u; ++j) *dst++ = static_cast(srcLeaf.getValue(j)); - } - } - }); -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::processLeafs() -{ - using DstDataT = typename NanoLeaf::DataType; - static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); - static_assert(DstDataT::padding()==0u, "Expected leaf nodes to have no padding"); - - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - const uint8_t flags = mIncludeStats ? 16u : 0u;// 4th bit indicates stats - DstDataT *dstData = this->template dstNode(r.begin())->data();// fixed size - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mBBoxDif[0] = dstData->mBBoxDif[1] = dstData->mBBoxDif[2] = 0u; - dstData->mFlags = flags; - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - dstData->mOffset = mValIdx[0][i]; - if constexpr(BuildTraits::is_onindex) { - const uint64_t *w = dstData->mValueMask.words(); -#ifdef USE_OLD_VALUE_ON_INDEX - int32_t sum = CountOn(*w++); - uint8_t *p = reinterpret_cast(&dstData->mPrefixSum), *q = p + 7; - for (int j=0; j<7; ++j) { - *p++ = sum & 255u; - *q |= (sum >> 8) << j; - sum += CountOn(*w++); - } -#else - uint64_t &prefixSum = dstData->mPrefixSum, sum = CountOn(*w++); - prefixSum = sum; - for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 - sum += CountOn(*w++); - prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits - } -#endif - } else { - dstData->mPrefixSum = 0u; - } - if constexpr(BuildTraits::is_indexmask) dstData->mMask = dstData->mValueMask; - } - }); -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::value>::type -CreateNanoGrid::processLeafs() -{ - using DstDataT = typename NanoLeaf::DataType; - static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - if (DstDataT::padding()>0u) { - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - std::memset(reinterpret_cast(dstData), 0, DstDataT::memUsage()); - } else { - dstData->mBBoxDif[0] = dstData->mBBoxDif[1] = dstData->mBBoxDif[2] = 0u; - dstData->mFlags = 0u;// enable rendering, no bbox, no stats - 
dstData->mPadding[0] = dstData->mPadding[1] = 0u; - } - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - } - }); -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::value>::type -CreateNanoGrid::processLeafs() -{ - using DstDataT = typename NanoLeaf::DataType; - static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - if (DstDataT::padding()>0u) { - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - std::memset(reinterpret_cast(dstData), 0, DstDataT::memUsage()); - } else { - dstData->mBBoxDif[0] = dstData->mBBoxDif[1] = dstData->mBBoxDif[2] = 0u; - dstData->mFlags = 0u;// enable rendering, no bbox, no stats - } - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - if constexpr(!is_same::value) { - for (int j=0; j<512; ++j) dstData->mValues.set(j, static_cast(srcLeaf.getValue(j))); - } else if constexpr(SrcNodeAccT::IS_OPENVDB) { - dstData->mValues = *reinterpret_cast*>(srcLeaf.buffer().data()); - } else if constexpr(SrcNodeAccT::IS_NANOVDB) { - dstData->mValues = srcLeaf.data()->mValues; - } else {// build::Leaf - dstData->mValues = srcLeaf.mValues; // copy value mask - } - } - }); -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::is_FpX>::type -CreateNanoGrid::processLeafs() -{ - using DstDataT = typename NanoLeaf::DataType; - static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); - using ArrayT = typename DstDataT::ArrayType; - static_assert(is_same::value, "Expected ValueT == float"); - using FloatT = typename std::conditional=16, double, float>::type;// 16 compression and higher requires double - static constexpr FloatT UNITS = FloatT((1 << DstDataT::bitWidth()) - 1);// # of unique non-zero values - DitherLUT lut(mDitherOn); - - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - if (DstDataT::padding()>0u) { - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - std::memset(reinterpret_cast(dstData), 0, DstDataT::memUsage()); - } else { - dstData->mFlags = dstData->mBBoxDif[2] = dstData->mBBoxDif[1] = dstData->mBBoxDif[0] = 0u; - dstData->mDev = dstData->mAvg = dstData->mMax = dstData->mMin = 0u; - } - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - // compute extrema values - float min = std::numeric_limits::max(), max = -min; - for (uint32_t j=0; j<512u; ++j) { - const float v = srcLeaf.getValue(j); - if (v < min) min = v; - if (v > max) max = v; - } - dstData->init(min, max, DstDataT::bitWidth()); - // perform quantization relative to the values in the current leaf node - const FloatT encode = UNITS/(max-min); - uint32_t offset = 0; - 
auto quantize = [&]()->ArrayT{ - const ArrayT tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); - ++offset; - return tmp; - }; - auto *code = reinterpret_cast(dstData->mCode); - if (is_same::value) {// resolved at compile-time - for (uint32_t j=0; j<128u; ++j) { - auto tmp = quantize(); - *code++ = quantize() << 4 | tmp; - tmp = quantize(); - *code++ = quantize() << 4 | tmp; - } - } else { - for (uint32_t j=0; j<128u; ++j) { - *code++ = quantize(); - *code++ = quantize(); - *code++ = quantize(); - *code++ = quantize(); - } - } - } - }); -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::value>::type -CreateNanoGrid::processLeafs() -{ - static_assert(is_same::value, "Expected SrcValueT == float"); - DitherLUT lut(mDitherOn); - forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - auto &srcLeaf = mSrcNodeAcc.template node<0>(i); - auto *dstData = this->template dstNode(i)->data(); - dstData->mBBoxMin = srcLeaf.origin(); // copy origin of node - dstData->mBBoxDif[0] = dstData->mBBoxDif[1] = dstData->mBBoxDif[2] = 0u; - const uint8_t logBitWidth = mCodec[i].log2; - dstData->mFlags = logBitWidth << 5;// pack logBitWidth into 3 MSB of mFlag - dstData->mValueMask = srcLeaf.getValueMask(); // copy value mask - const float min = mCodec[i].min, max = mCodec[i].max; - dstData->init(min, max, uint8_t(1) << logBitWidth); - // perform quantization relative to the values in the current leaf node - uint32_t offset = 0; - float encode = 0.0f; - auto quantize = [&]()->uint8_t{ - const uint8_t tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); - ++offset; - return tmp; - }; - auto *dst = reinterpret_cast(dstData+1); - switch (logBitWidth) { - case 0u: {// 1 bit - encode = 1.0f/(max - min); - for (int j=0; j<64; ++j) { - uint8_t a = 0; - for (int k=0; k<8; ++k) a |= quantize() << k; - *dst++ = a; - } - } - break; - case 1u: {// 2 bits - encode = 3.0f/(max - min); - for (int j=0; j<128; ++j) { - auto a = quantize(); - a |= quantize() << 2; - a |= quantize() << 4; - *dst++ = quantize() << 6 | a; - } - } - break; - case 2u: {// 4 bits - encode = 15.0f/(max - min); - for (int j=0; j<128; ++j) { - auto a = quantize(); - *dst++ = quantize() << 4 | a; - a = quantize(); - *dst++ = quantize() << 4 | a; - } - } - break; - case 3u: {// 8 bits - encode = 255.0f/(max - min); - for (int j=0; j<128; ++j) { - *dst++ = quantize(); - *dst++ = quantize(); - *dst++ = quantize(); - *dst++ = quantize(); - } - } - break; - default: {// 16 bits - special implementation using higher bit-precision - auto *dst = reinterpret_cast(dstData+1); - const double encode = 65535.0/(max - min);// note that double is required! 
- for (int j=0; j<128; ++j) { - *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; - *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; - *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; - *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; - } - } - }// end switch - } - });// kernel -} // CreateNanoGrid::processLeafs - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::processInternalNodes() -{ - using DstNodeT = typename NanoNode::type; - using DstValueT = typename DstNodeT::ValueType; - using DstChildT = typename NanoNode::type; - static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); - - const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); - if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes - uint64_t childCount = 0; - auto *dstData = this->template dstNode(0)->data(); - for (uint64_t i=0; i(i).getChildMask().countOn(); - } - } - - forEach(0, nodeCount, 4, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcNode = mSrcNodeAcc.template node(i); - uint64_t childID = dstData->mFlags; - if (DstNodeT::DataType::padding()>0u) { - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - std::memset(reinterpret_cast(dstData), 0, DstNodeT::memUsage()); - } else { - dstData->mFlags = 0;// enable rendering, no bbox, no stats - dstData->mMinimum = dstData->mMaximum = typename DstNodeT::ValueType(); - dstData->mAverage = dstData->mStdDevi = 0; - } - dstData->mBBox[0] = srcNode.origin(); // copy origin of node - dstData->mValueMask = srcNode.getValueMask(); // copy value mask - dstData->mChildMask = srcNode.getChildMask(); // copy child mask - for (auto it = srcNode.cbeginChildAll(); it; ++it) { - SrcValueT value{}; // default initialization - if (it.probeChild(value)) { - DstChildT *dstChild = this->template dstNode(childID++);// might be Leaf - dstData->setChild(it.pos(), dstChild); - } else { - dstData->setValue(it.pos(), static_cast(value)); - } - } - } - }); -} // CreateNanoGrid::processInternalNodes - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::processInternalNodes() -{ - using DstNodeT = typename NanoNode::type; - using DstChildT = typename NanoNode::type; - static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); - static_assert(DstNodeT::DataType::padding()==0u, "Expected internal nodes to have no padding"); - - const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); - if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes - uint64_t childCount = 0; - auto *dstData = this->template dstNode(0)->data(); - for (uint64_t i=0; i(i).getChildMask().countOn(); - } - } - - forEach(0, nodeCount, 4, [&](const Range1D& r) { - auto *dstData = this->template dstNode(r.begin())->data(); - for (auto i = r.begin(); i != r.end(); ++i, ++dstData) { - auto &srcNode = mSrcNodeAcc.template node(i); - uint64_t childID = dstData->mFlags; - dstData->mFlags = 0u; - dstData->mBBox[0] = srcNode.origin(); // copy origin of node - dstData->mValueMask = srcNode.getValueMask(); // copy value mask - dstData->mChildMask = 
srcNode.getChildMask(); // copy child mask - uint64_t n = mIncludeTiles ? mValIdx[LEVEL][i] : 0u; - for (auto it = srcNode.cbeginChildAll(); it; ++it) { - SrcValueT value; - if (it.probeChild(value)) { - DstChildT *dstChild = this->template dstNode(childID++);// might be Leaf - dstData->setChild(it.pos(), dstChild); - } else { - uint64_t m = 0u; - if (mIncludeTiles && !((BuildTraits::is_onindex) && dstData->mValueMask.isOff(it.pos()))) m = n++; - dstData->setValue(it.pos(), m); - } - } - if (mIncludeTiles && mIncludeStats) {// stats are always placed after the tile values - dstData->mMinimum = n++; - dstData->mMaximum = n++; - dstData->mAverage = n++; - dstData->mStdDevi = n++; - } else {// if not tiles or stats set stats to the background offset - dstData->mMinimum = 0u; - dstData->mMaximum = 0u; - dstData->mAverage = 0u; - dstData->mStdDevi = 0u; - } - } - }); -} // CreateNanoGrid::processInternalNodes - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::processRoot() -{ - using DstRootT = NanoRoot; - using DstValueT = typename DstRootT::ValueType; - auto &srcRoot = mSrcNodeAcc.root(); - auto *dstData = this->template dstRoot()->data(); - const uint32_t tableSize = srcRoot.getTableSize(); - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - if (DstRootT::DataType::padding()>0) std::memset(reinterpret_cast(dstData), 0, DstRootT::memUsage(tableSize)); - dstData->mTableSize = tableSize; - dstData->mMinimum = dstData->mMaximum = dstData->mBackground = srcRoot.background(); - dstData->mBBox = CoordBBox(); // // set to an empty bounding box - if (tableSize==0) return; - auto *dstChild = this->template dstNode(0);// fixed size and linear in memory - auto *dstTile = dstData->tile(0);// fixed size and linear in memory - for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) { - SrcValueT value; - if (it.probeChild(value)) { - dstTile->setChild(it.getCoord(), dstChild++, dstData); - } else { - dstTile->setValue(it.getCoord(), it.isValueOn(), static_cast(value)); - } - } -} // CreateNanoGrid::processRoot - -//================================================================================================ - -template -template -inline typename enable_if::is_index>::type -CreateNanoGrid::processRoot() -{ - using DstRootT = NanoRoot; - auto &srcRoot = mSrcNodeAcc.root(); - auto *dstData = this->template dstRoot()->data(); - const uint32_t tableSize = srcRoot.getTableSize(); - // Cast to void* to avoid compiler warning about missing trivial copy-assignment - if (DstRootT::DataType::padding()>0) std::memset(reinterpret_cast(dstData), 0, DstRootT::memUsage(tableSize)); - dstData->mTableSize = tableSize; - dstData->mBackground = 0u; - uint64_t valueCount = 0u;// the first entry is always the background value - dstData->mBBox = CoordBBox(); // set to an empty/invalid bounding box - - if (tableSize>0) { - auto *dstChild = this->template dstNode(0);// fixed size and linear in memory - auto *dstTile = dstData->tile(0);// fixed size and linear in memory - for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) { - SrcValueT tmp; - if (it.probeChild(tmp)) { - dstTile->setChild(it.getCoord(), dstChild++, dstData); - } else { - dstTile->setValue(it.getCoord(), it.isValueOn(), 0u); - if (mIncludeTiles && !((BuildTraits::is_onindex) && !dstTile->state)) dstTile->value = ++valueCount; - } - } - } - if (mIncludeTiles && mIncludeStats) {// 
-template <typename SrcGridT>
-template <typename DstBuildT>
-void CreateNanoGrid<SrcGridT>::processTree()
-{
-    const uint64_t nodeCount[3] = {mSrcNodeAcc.nodeCount(0), mSrcNodeAcc.nodeCount(1), mSrcNodeAcc.nodeCount(2)};
-    auto *dstTree = this->template dstTree<DstBuildT>();
-    auto *dstData = dstTree->data();
-    dstData->setRoot( this->template dstRoot<DstBuildT>() );
-
-    dstData->setFirstNode(nodeCount[2] ? this->template dstNode<DstBuildT, 2>(0) : nullptr);
-    dstData->setFirstNode(nodeCount[1] ? this->template dstNode<DstBuildT, 1>(0) : nullptr);
-    dstData->setFirstNode(nodeCount[0] ? this->template dstNode<DstBuildT, 0>(0) : nullptr);
-
-    dstData->mNodeCount[0] = static_cast<uint32_t>(nodeCount[0]);
-    dstData->mNodeCount[1] = static_cast<uint32_t>(nodeCount[1]);
-    dstData->mNodeCount[2] = static_cast<uint32_t>(nodeCount[2]);
-
-    // Count number of active leaf level tiles
-    dstData->mTileCount[0] = reduce(Range1D(0,nodeCount[1]), uint32_t(0), [&](Range1D &r, uint32_t sum){
-        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<1>(i).getValueMask().countOn();
-        return sum;}, std::plus<uint32_t>());
-
-    // Count number of active lower internal node tiles
-    dstData->mTileCount[1] = reduce(Range1D(0,nodeCount[2]), uint32_t(0), [&](Range1D &r, uint32_t sum){
-        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<2>(i).getValueMask().countOn();
-        return sum;}, std::plus<uint32_t>());
-
-    // Count number of active upper internal node tiles
-    dstData->mTileCount[2] = 0;
-    for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) dstData->mTileCount[2] += 1;
-
-    // Count number of active voxels
-    dstData->mVoxelCount = reduce(Range1D(0, nodeCount[0]), uint64_t(0), [&](Range1D &r, uint64_t sum){
-        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getValueMask().countOn();
-        return sum;}, std::plus<uint64_t>());
-
-    dstData->mVoxelCount += uint64_t(dstData->mTileCount[0]) << 9;//  = 3 * 3
-    dstData->mVoxelCount += uint64_t(dstData->mTileCount[1]) << 21;// = 3 * (3+4)
-    dstData->mVoxelCount += uint64_t(dstData->mTileCount[2]) << 36;// = 3 * (3+4+5)
-
-} // CreateNanoGrid::processTree
-
-//================================================================================================
-
-template <typename SrcGridT>
-template <typename DstBuildT>
-void CreateNanoGrid<SrcGridT>::processGrid()
-{
-    auto* dstData = this->template dstGrid<DstBuildT>()->data();
-    dstData->init({GridFlags::IsBreadthFirst}, mOffset.size, mSrcNodeAcc.map(),
-                  mapToGridType<DstBuildT>(), mapToGridClass<DstBuildT>(mSrcNodeAcc.gridClass()));
-    dstData->mBlindMetadataCount = static_cast<uint32_t>(mBlindMetaData.size());
-    dstData->mData1 = this->valueCount();
-
-    if (!isValid(dstData->mGridType, dstData->mGridClass)) {
-#if 1
-        fprintf(stderr,"Warning: Strange combination of GridType(\"%s\") and GridClass(\"%s\"). Consider changing GridClass to \"Unknown\"\n",
-                toStr(dstData->mGridType), toStr(dstData->mGridClass));
-#else
-        throw std::runtime_error("Invalid combination of GridType("+std::to_string(int(dstData->mGridType))+
-                                 ") and GridClass("+std::to_string(int(dstData->mGridClass))+"). See NanoVDB.h for details!");
-#endif
-    }
-
-    std::memset(dstData->mGridName, '\0', GridData::MaxNameSize);//overwrite mGridName
-    strncpy(dstData->mGridName, mSrcNodeAcc.getName().c_str(), GridData::MaxNameSize-1);
-    if (mSrcNodeAcc.hasLongGridName()) dstData->setLongGridNameOn();// grid name is long so store it as blind data
-
-    // Partially process blind meta data - they will be completed in postProcess
-    if (mBlindMetaData.size()>0) {
-        auto *metaData = this->dstMeta(0);
-        dstData->mBlindMetadataOffset = PtrDiff(metaData, dstData);
-        dstData->mBlindMetadataCount = static_cast<uint32_t>(mBlindMetaData.size());
-        char *blindData = PtrAdd<char>(mBufferPtr, mOffset.blind);
-        for (const auto &b : mBlindMetaData) {
-            std::memcpy(metaData, b.metaData, sizeof(GridBlindMetaData));
-            metaData->setBlindData(blindData);// sets metaData.mOffset
-            if (metaData->mDataClass == GridBlindDataClass::GridName) strcpy(blindData, mSrcNodeAcc.getName().c_str());
-            ++metaData;
-            blindData += b.size;
-        }
-        mBlindMetaData.clear();
-    }
-} // CreateNanoGrid::processGrid
-
-//================================================================================================
-
-template <typename SrcGridT>
-template <typename DstBuildT>
-inline typename disable_if<BuildTraits<DstBuildT>::is_index>::type
-CreateNanoGrid<SrcGridT>::postProcess()
-{
-    if constexpr(is_same<FpN, DstBuildT>::value) mCodec.reset();
-    auto *dstGrid = this->template dstGrid<DstBuildT>();
-    gridStats(*dstGrid, mStats);
-#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
-    auto *metaData = this->dstMeta(0);
-    if constexpr(is_same<SrcGridT, openvdb::points::PointDataGrid>::value ||
-                 is_same<SrcGridT, openvdb::tools::PointIndexGrid>::value) {
-        static_assert(is_same<uint32_t, DstBuildT>::value, "expected DstBuildT==uint32_t");
-        auto *dstData0 = this->template dstNode<DstBuildT, 0>(0)->data();
-        dstData0->mMinimum = 0; // start of prefix sum
-        dstData0->mMaximum = dstData0->mValues[511u];
-        for (uint32_t i=1, n=mSrcNodeAcc.nodeCount(0); i<n; ++i) {
-            auto *dstData1 = dstData0 + 1;
-            dstData1->mMinimum = dstData0->mMinimum + dstData0->mMaximum;
-            dstData1->mMaximum = dstData1->mValues[511u];
-            dstData0 = dstData1;
-        }
-        for (size_t i = 0, n = dstGrid->blindDataCount(); i < n; ++i, ++metaData) {
-            if constexpr(is_same<SrcGridT, openvdb::tools::PointIndexGrid>::value) {
-                if (metaData->mDataClass != GridBlindDataClass::IndexArray) continue;
-                if (metaData->mDataType == GridType::UInt32) {
-                    uint32_t *blindData = const_cast<uint32_t*>(metaData->template getBlindData<uint32_t>());
-                    forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) {
-                        auto *dstData = this->template dstNode<DstBuildT, 0>(r.begin())->data();
-                        for (auto j = r.begin(); j != r.end(); ++j, ++dstData) {
-                            uint32_t* p = blindData + dstData->mMinimum;
-                            for (uint32_t idx : mSrcNodeAcc.template node<0>(j).indices()) *p++ = idx;
-                        }
-                    });
-                }
-            } else {// if constexpr(is_same<SrcGridT, openvdb::points::PointDataGrid>::value)
-                if (metaData->mDataClass != GridBlindDataClass::AttributeArray) continue;
-                if (auto *blindData = dstGrid->template getBlindData<float>(i)) {
-                    this->template copyPointAttribute(i, blindData);
-                } else if (auto *blindData = dstGrid->template getBlindData<nanovdb::Vec3f>(i)) {
-                    this->template copyPointAttribute(i, reinterpret_cast<openvdb::Vec3f*>(blindData));
-                } else if (auto *blindData = dstGrid->template getBlindData<int32_t>(i)) {
-                    this->template copyPointAttribute(i, blindData);
-                } else if (auto *blindData = dstGrid->template getBlindData<int64_t>(i)) {
-                    this->template copyPointAttribute(i, blindData);
-                } else {
-                    std::cerr << "unsupported point attribute \"" << toStr(metaData->mDataType) << "\"\n";
-                }
-            }// if
-        }// loop
-    } else { // if
-        (void)metaData;
-    }
-#endif
-    updateChecksum(*dstGrid, mChecksum);
-}// CreateNanoGrid::postProcess
-
-//================================================================================================
-
-template <typename SrcGridT>
-template <typename DstBuildT>
-inline typename enable_if<BuildTraits<DstBuildT>::is_index>::type
-CreateNanoGrid<SrcGridT>::postProcess(uint32_t channels)
-{
-    const std::string typeName = toStr(mapToGridType<SrcValueT>());
-    const uint64_t valueCount = this->valueCount();
-    auto *dstGrid = this->template dstGrid<DstBuildT>();
-    for (uint32_t i=0; i<channels; ++i) {
-        const std::string name = "channel_"+std::to_string(i);
-        int j = dstGrid->findBlindData(name.c_str());
-        if (j<0) throw std::runtime_error("missing " + name);
-        auto *metaData = this->dstMeta(j);// partially set in processGrid
-        metaData->mDataClass = GridBlindDataClass::ChannelArray;
-        metaData->mDataType = mapToGridType<SrcValueT>();
-        SrcValueT *blindData = const_cast<SrcValueT*>(metaData->template getBlindData<SrcValueT>());
-        if (i>0) {// concurrent copy from previous channel
-            nanovdb::forEach(0,valueCount,1024,[&](const nanovdb::Range1D &r){
-                SrcValueT *dst=blindData+r.begin(), *end=dst+r.size(), *src=dst-valueCount;
-                while(dst!=end) *dst++ = *src++;
-            });
-        } else {
-            this->template copyValues<DstBuildT>(blindData);
-        }
-    }// loop over channels
-    gridStats(*(this->template dstGrid<DstBuildT>()), std::min(StatsMode::BBox, mStats));
-    updateChecksum(*dstGrid, mChecksum);
-}// CreateNanoGrid::postProcess
-
-//================================================================================================
-
-template <typename SrcGridT>
-template <typename DstBuildT>
-typename enable_if<BuildTraits<DstBuildT>::is_index>::type
-CreateNanoGrid<SrcGridT>::copyValues(SrcValueT *buffer)
-{// copy values from the source grid into the provided buffer
-    assert(mBufferPtr && buffer);
-    using StatsT = typename FloatTraits<SrcValueT>::FloatType;
-
-    if (this->valueCount()==0) this->template countValues<DstBuildT>();
-
-    auto copyNodeValues = [&](const auto &node, SrcValueT *v) {
-        if constexpr(BuildTraits<DstBuildT>::is_onindex) {
-            for (auto it = node.cbeginValueOn(); it; ++it) *v++ = *it;
-        } else {
-            for (auto it = node.cbeginValueAll(); it; ++it) *v++ = *it;
-        }
-        if (mIncludeStats) {
-            if constexpr(SrcNodeAccT::IS_NANOVDB) {// resolved at compile time
-                *v++ = node.minimum();
-                *v++ = node.maximum();
-                if constexpr(is_same<SrcValueT, StatsT>::value) {
-                    *v++ = node.average();
-                    *v++ = node.stdDeviation();
-                } else {// eg when SrcValueT=Vec3f and StatsT=float
-                    *v++ = SrcValueT(node.average());
-                    *v++ = SrcValueT(node.stdDeviation());
-                }
-            } else {// openvdb and nanovdb::build::Grid have no stats
-                *v++ = buffer[0];// background
-                *v++ = buffer[0];// background
-                *v++ = buffer[0];// background
-                *v++ = buffer[0];// background
-            }
-        }
-    };// copyNodeValues
-
-    const SrcRootT &root = mSrcNodeAcc.root();
-    buffer[0] = root.background();// Value array always starts with the background value
-    if (mIncludeTiles) {
-        copyNodeValues(root, buffer + 1u);
-        forEach(0, mSrcNodeAcc.nodeCount(2), 1, [&](const Range1D& r) {
-            for (auto i = r.begin(); i!=r.end(); ++i) {
-                copyNodeValues(mSrcNodeAcc.template node<2>(i), buffer + mValIdx[2][i]);
-            }
-        });
-        forEach(0, mSrcNodeAcc.nodeCount(1), 1, [&](const Range1D& r) {
-            for (auto i = r.begin(); i!=r.end(); ++i) {
-                copyNodeValues(mSrcNodeAcc.template node<1>(i), buffer + mValIdx[1][i]);
-            }
-        });
-    }
-    forEach(0, mSrcNodeAcc.nodeCount(0), 4, [&](const Range1D& r) {
-        for (auto i = r.begin(); i!=r.end(); ++i) {
-            copyNodeValues(mSrcNodeAcc.template node<0>(i), buffer + mValIdx[0][i]);
-        }
-    });
-}// CreateNanoGrid::copyValues
-
-
-//================================================================================================
-
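A sketch of the per-node slice layout that `copyValues()` writes (editorial; the helper below is hypothetical): a node's values start at `mValIdx[level][i]`, and when stats are included four extra slots (min, max, average, std-dev) trail the values:

```cpp
#include <cstdint>

// Hypothetical view over one leaf's slice of a channel written by copyValues():
// [v0 ... v(count-1), min, max, average, stdDev] starting at buffer + mValIdx[0][i].
struct LeafSlice {
    const float* base;  // buffer + mValIdx[0][i]
    uint32_t     count; // number of values copied for this leaf (on or all)
    float minimum() const { return base[count + 0]; }
    float maximum() const { return base[count + 1]; }
    float average() const { return base[count + 2]; }
    float stdDev()  const { return base[count + 3]; }
};
```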
-#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
-
-template <typename SrcGridT>
-template <typename T>
-typename disable_if<is_same<T, openvdb::points::PointDataGrid>::value ||
-                    is_same<T, openvdb::tools::PointIndexGrid>::value, uint64_t>::type
-CreateNanoGrid<SrcGridT>::countPoints() const
-{
-    static_assert(is_same<T, SrcGridT>::value, "expected default template parameter");
-    return 0u;
-}// CreateNanoGrid::countPoints
-
-template <typename SrcGridT>
-template <typename T>
-typename enable_if<is_same<T, openvdb::points::PointDataGrid>::value ||
-                   is_same<T, openvdb::tools::PointIndexGrid>::value, uint64_t>::type
-CreateNanoGrid<SrcGridT>::countPoints() const
-{
-    static_assert(is_same<T, SrcGridT>::value, "expected default template parameter");
-    return reduce(0, mSrcNodeAcc.nodeCount(0), 8, uint64_t(0), [&](auto &r, uint64_t sum) {
-        for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getLastValue();
-        return sum;}, std::plus<uint64_t>());
-}// CreateNanoGrid::countPoints
-
-template <typename SrcGridT>
-template <typename AttT, typename CodecT, typename T>
-typename enable_if<is_same<T, openvdb::points::PointDataGrid>::value>::type
-CreateNanoGrid<SrcGridT>::copyPointAttribute(size_t attIdx, AttT *attPtr)
-{
-    static_assert(std::is_same<T, SrcGridT>::value, "Expected default parameter");
-    using HandleT = openvdb::points::AttributeHandle<AttT, CodecT>;
-    forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) {
-        auto *dstData = this->template dstNode<uint32_t, 0>(r.begin())->data();
-        for (auto i = r.begin(); i != r.end(); ++i, ++dstData) {
-            auto& srcLeaf = mSrcNodeAcc.template node<0>(i);
-            HandleT handle(srcLeaf.constAttributeArray(attIdx));
-            AttT *p = attPtr + dstData->mMinimum;
-            for (auto iter = srcLeaf.beginIndexOn(); iter; ++iter) *p++ = handle.get(*iter);
-        }
-    });
-}// CreateNanoGrid::copyPointAttribute
-
-#endif
-
-//================================================================================================
-
-template<typename SrcGridT, typename DstBuildT, typename BufferT>
-typename disable_if<BuildTraits<DstBuildT>::is_index || BuildTraits<DstBuildT>::is_Fp, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               StatsMode sMode,
-               ChecksumMode cMode,
-               int verbose,
-               const BufferT &buffer)
-{
-    CreateNanoGrid<SrcGridT> converter(srcGrid);
-    converter.setStats(sMode);
-    converter.setChecksum(cMode);
-    converter.setVerbose(verbose);
-    return converter.template getHandle<DstBuildT, BufferT>(buffer);
-}// createNanoGrid
-
-//================================================================================================
-
-template<typename SrcGridT, typename DstBuildT, typename BufferT>
-typename enable_if<BuildTraits<DstBuildT>::is_index, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               uint32_t channels,
-               bool includeStats,
-               bool includeTiles,
-               int verbose,
-               const BufferT &buffer)
-{
-    CreateNanoGrid<SrcGridT> converter(srcGrid);
-    converter.setVerbose(verbose);
-    return converter.template getHandle<DstBuildT, BufferT>(channels, includeStats, includeTiles, buffer);
-}
-
-//================================================================================================
-
-template<typename SrcGridT, typename DstBuildT, typename OracleT, typename BufferT>
-typename enable_if<is_same<FpN, DstBuildT>::value, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               StatsMode sMode,
-               ChecksumMode cMode,
-               bool ditherOn,
-               int verbose,
-               const OracleT &oracle,
-               const BufferT &buffer)
-{
-    CreateNanoGrid<SrcGridT> converter(srcGrid);
-    converter.setStats(sMode);
-    converter.setChecksum(cMode);
-    converter.enableDithering(ditherOn);
-    converter.setVerbose(verbose);
-    return converter.template getHandle<DstBuildT, OracleT, BufferT>(oracle, buffer);
-}// createNanoGrid
-
-//================================================================================================
-
-template<typename SrcGridT, typename DstBuildT, typename BufferT>
-typename enable_if<BuildTraits<DstBuildT>::is_FpX, GridHandle<BufferT>>::type
-createNanoGrid(const SrcGridT &srcGrid,
-               StatsMode sMode,
-               ChecksumMode cMode,
-               bool ditherOn,
-               int verbose,
-               const BufferT &buffer)
-{
-    CreateNanoGrid<SrcGridT> converter(srcGrid);
-    converter.setStats(sMode);
-    converter.setChecksum(cMode);
-    converter.enableDithering(ditherOn);
-    converter.setVerbose(verbose);
-    return converter.template getHandle<DstBuildT, BufferT>(buffer);
-}// createNanoGrid
-
-//================================================================================================
-
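A usage sketch for the overloads above, assuming the defaulted stats/checksum/verbose/buffer arguments from the declarations earlier in this header and the `build::Grid` from GridBuilder.h (pre-move include paths, i.e. the ones this very diff deprecates):

```cpp
#include <nanovdb/util/GridBuilder.h>    // nanovdb::build::Grid
#include <nanovdb/util/CreateNanoGrid.h> // createNanoGrid

int main()
{
    nanovdb::build::Grid<float> grid(0.0f); // background value 0
    grid.setValue(nanovdb::Coord(1, 2, 3), 1.0f);
    auto handle = nanovdb::createNanoGrid(grid); // GridHandle owning the serialized grid
    auto* nanoGrid = handle.grid<float>();       // typed view into the buffer
    return nanoGrid ? 0 : 1;
}
```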
-#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__)
-template<typename BufferT>
-GridHandle<BufferT>
-openToNanoVDB(const openvdb::GridBase::Ptr& base,
-              StatsMode sMode,
-              ChecksumMode cMode,
-              int verbose)
-{
-    // We need to define these types because they are not defined in OpenVDB
-    using openvdb_Vec4fTree = typename openvdb::tree::Tree4<openvdb::Vec4f, 5, 4, 3>::Type;
-    using openvdb_Vec4dTree = typename openvdb::tree::Tree4<openvdb::Vec4d, 5, 4, 3>::Type;
-    using openvdb_Vec4fGrid = openvdb::Grid<openvdb_Vec4fTree>;
-    using openvdb_Vec4dGrid = openvdb::Grid<openvdb_Vec4dTree>;
-    using openvdb_UInt32Grid = openvdb::Grid<openvdb::UInt32Tree>;
-
-    if (auto grid = openvdb::GridBase::grid<openvdb::FloatGrid>(base)) {
-        return createNanoGrid<openvdb::FloatGrid, float, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::DoubleGrid>(base)) {
-        return createNanoGrid<openvdb::DoubleGrid, double, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::Int32Grid>(base)) {
-        return createNanoGrid<openvdb::Int32Grid, int32_t, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::Int64Grid>(base)) {
-        return createNanoGrid<openvdb::Int64Grid, int64_t, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb_UInt32Grid>(base)) {
-        return createNanoGrid<openvdb_UInt32Grid, uint32_t, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::Vec3fGrid>(base)) {
-        return createNanoGrid<openvdb::Vec3fGrid, nanovdb::Vec3f, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::Vec3dGrid>(base)) {
-        return createNanoGrid<openvdb::Vec3dGrid, nanovdb::Vec3d, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::tools::PointIndexGrid>(base)) {
-        return createNanoGrid<openvdb::tools::PointIndexGrid, uint32_t, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::points::PointDataGrid>(base)) {
-        return createNanoGrid<openvdb::points::PointDataGrid, uint32_t, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::MaskGrid>(base)) {
-        return createNanoGrid<openvdb::MaskGrid, nanovdb::ValueMask, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb::BoolGrid>(base)) {
-        return createNanoGrid<openvdb::BoolGrid, bool, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb_Vec4fGrid>(base)) {
-        return createNanoGrid<openvdb_Vec4fGrid, nanovdb::Vec4f, BufferT>(*grid, sMode, cMode, verbose);
-    } else if (auto grid = openvdb::GridBase::grid<openvdb_Vec4dGrid>(base)) {
-        return createNanoGrid<openvdb_Vec4dGrid, nanovdb::Vec4d, BufferT>(*grid, sMode, cMode, verbose);
-    } else {
-        OPENVDB_THROW(openvdb::RuntimeError, "Unrecognized OpenVDB grid type");
-    }
-}// openToNanoVDB
-#endif
-
-} // namespace nanovdb
-
-#endif // NANOVDB_CREATE_NANOGRID_H_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/CreateNanoGrid.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/CreateNanoGrid.h instead.")
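With the three replacement lines above, the old header becomes a forwarding shim, so downstream code keeps compiling while warning. The migration itself is a one-line include change (paths taken from the deprecation message):

```cpp
// Before: still compiles, but now triggers NANOVDB_DEPRECATED_HEADER's warning.
#include <nanovdb/util/CreateNanoGrid.h>
// After: the new NanoVDB layout targeted by this diff.
#include <nanovdb/tools/CreateNanoGrid.h>
```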
diff --git a/nanovdb/nanovdb/util/DitherLUT.h b/nanovdb/nanovdb/util/DitherLUT.h
index 69c3b33031..270f82d378 100644
--- a/nanovdb/nanovdb/util/DitherLUT.h
+++ b/nanovdb/nanovdb/util/DitherLUT.h
@@ -1,185 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-//
-/// @author Jeff Lait
-///
-/// @date May 13, 2021
-///
-/// @file DitherLUT.h
-///
-/// @brief Defines a look-up table used to dither the values of 8^3 leaf nodes.
-
-#ifndef NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
-#define NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
-
-#include <nanovdb/NanoVDB.h> // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4
-
-namespace nanovdb {
-
-class DitherLUT
-{
-    const bool mEnable;
-public:
-    /// @brief Constructor with an optional flag that enables or disables dithering
-    __hostdev__ DitherLUT(bool enable = true) : mEnable(enable) {}
-
-    /// @brief Retrieves the dither threshold for an offset within an 8^3 leaf node.
-    ///
-    /// @param offset into the lookup table of size 512
-    __hostdev__ float operator()(const int offset)
-    {
-
-// This table was generated with
-/**************
-
-static constexpr inline uint32
-SYSwang_inthash(uint32 key)
-{
-    // From http://www.concentric.net/~Ttwang/tech/inthash.htm
-    key += ~(key << 16);
-    key ^= (key >> 5);
-    key += (key << 3);
-    key ^= (key >> 13);
-    key += ~(key << 9);
-    key ^= (key >> 17);
-    return key;
-}
-
-static void
-ut_initDitherR(float *pattern, float offset,
-               int x, int y, int z, int res, int goalres)
-{
-    // These offsets are designed to maximize the difference between
-    // dither values in nearby voxels within a given 2x2x2 cell, without
-    // producing axis-aligned artifacts. They are organized in row-major
-    // order.
-    static const float theDitherOffset[] = {0,4,6,2,5,1,3,7};
-    static const float theScale = 0.125F;
-    int key = (((z << res) + y) << res) + x;
-
-    if (res == goalres)
-    {
-        pattern[key] = offset;
-        return;
-    }
-
-    // Randomly flip (on each axis) the dithering patterns used by the
-    // subcells. This key is xor'd with the subcell index below before
-    // looking up in the dither offset list.
-    key = SYSwang_inthash(key) & 7;
-
-    x <<= 1;
-    y <<= 1;
-    z <<= 1;
-
-    offset *= theScale;
-    for (int i = 0; i < 8; i++)
-        ut_initDitherR(pattern, offset+theDitherOffset[i ^ key]*theScale,
-                       x+(i&1), y+((i&2)>>1), z+((i&4)>>2), res+1, goalres);
-}
-
-// This is a compact algorithm that accomplishes essentially the same thing
-// as ut_initDither() above. We should eventually switch to use this and
-// clean the dead code.
-static fpreal32 *
-ut_initDitherRecursive(int goalres)
-{
-    const int nfloat = 1 << (goalres*3);
-    float *pattern = new float[nfloat];
-    ut_initDitherR(pattern, 1.0F, 0, 0, 0, 0, goalres);
-
-    // This has built an even spacing from 1/nfloat to 1.0.
-    // However, our dither pattern should be 1/(nfloat+1) to nfloat/(nfloat+1),
-    // so we do a correction here. Note that the earlier calculations are
-    // done with powers of 2 so are exact, so it does make sense to delay
-    // the renormalization to this pass.
-    float correctionterm = nfloat / (nfloat+1.0F);
-    for (int i = 0; i < nfloat; i++)
-        pattern[i] *= correctionterm;
-    return pattern;
-}
-
-    theDitherMatrix = ut_initDitherRecursive(3);
-
-    for (int i = 0; i < 512/8; i ++)
-    {
-        for (int j = 0; j < 8; j ++)
-            std::cout << theDitherMatrix[i*8+j] << "f, ";
-        std::cout << std::endl;
-    }
-
- **************/
-        static const float LUT[512] =
-        {
-            0.14425f, 0.643275f, 0.830409f, 0.331384f, 0.105263f, 0.604289f, 0.167641f, 0.666667f,
-            0.892788f, 0.393762f, 0.0818713f, 0.580897f, 0.853801f, 0.354776f, 0.916179f, 0.417154f,
-            0.612086f, 0.11306f, 0.79922f, 0.300195f, 0.510721f, 0.0116959f, 0.947368f, 0.448343f,
-            0.362573f, 0.861598f, 0.0506823f, 0.549708f, 0.261209f, 0.760234f, 0.19883f, 0.697856f,
-            0.140351f, 0.639376f, 0.576998f, 0.0779727f, 0.522417f, 0.0233918f, 0.460039f, 0.959064f,
-            0.888889f, 0.389864f, 0.327485f, 0.826511f, 0.272904f, 0.77193f, 0.709552f, 0.210526f,
-            0.483431f, 0.982456f, 0.296296f, 0.795322f, 0.116959f, 0.615984f, 0.0545809f, 0.553606f,
-            0.732943f, 0.233918f, 0.545809f, 0.0467836f, 0.865497f, 0.366472f, 0.803119f, 0.304094f,
-            0.518519f, 0.0194932f, 0.45614f, 0.955166f, 0.729045f, 0.230019f, 0.54191f, 0.042885f,
-            0.269006f, 0.768031f, 0.705653f, 0.206628f, 0.479532f, 0.978558f, 0.292398f, 0.791423f,
-            0.237817f, 0.736842f, 0.424951f, 0.923977f, 0.136452f, 0.635478f, 0.323587f, 0.822612f,
-            0.986355f, 0.487329f, 0.674464f, 0.175439f, 0.88499f, 0.385965f, 0.573099f, 0.0740741f,
-            0.51462f, 0.0155945f, 0.202729f, 0.701754f, 0.148148f, 0.647174f, 0.834308f, 0.335283f,
-            0.265107f, 0.764133f, 0.951267f, 0.452242f, 0.896686f, 0.397661f, 0.08577f, 0.584795f,
-            0.8577f, 0.358674f, 0.920078f, 0.421053f, 0.740741f, 0.241715f, 0.678363f, 0.179337f,
-            0.109162f, 0.608187f, 0.17154f, 0.670565f, 0.491228f, 0.990253f, 0.42885f, 0.927875f,
-            0.0662768f, 0.565302f, 0.62768f, 0.128655f, 0.183236f, 0.682261f, 0.744639f, 0.245614f,
-            0.814815f, 0.315789f, 0.378168f, 0.877193f, 0.931774f, 0.432749f, 0.495127f, 0.994152f,
-            0.0350877f, 0.534113f, 0.97076f, 0.471735f, 0.214425f, 0.71345f, 0.526316f, 0.0272904f,
-            0.783626f, 0.2846f, 0.222222f, 0.721248f, 0.962963f, 0.463938f, 0.276803f, 0.775828f,
-            0.966862f, 0.467836f, 0.405458f, 0.904483f, 0.0701754f, 0.569201f, 0.881092f, 0.382066f,
-            0.218324f, 0.717349f, 0.654971f, 0.155945f, 0.818713f, 0.319688f, 0.132554f, 0.631579f,
-            0.0623782f, 0.561404f, 0.748538f, 0.249513f, 0.912281f, 0.413255f, 0.974659f, 0.475634f,
-            0.810916f, 0.311891f, 0.499025f, 0.998051f, 0.163743f, 0.662768f, 0.226121f, 0.725146f,
-            0.690058f, 0.191033f, 0.00389864f, 0.502924f, 0.557505f, 0.0584795f, 0.120858f, 0.619883f,
-            0.440546f, 0.939571f, 0.752437f, 0.253411f, 0.307992f, 0.807018f, 0.869396f, 0.37037f,
-            0.658869f, 0.159844f, 0.346979f, 0.846004f, 0.588694f, 0.0896686f, 0.152047f, 0.651072f,
-            0.409357f, 0.908382f, 0.596491f, 0.0974659f, 0.339181f, 0.838207f, 0.900585f, 0.401559f,
-            0.34308f, 0.842105f, 0.779727f, 0.280702f, 0.693957f, 0.194932f, 0.25731f, 0.756335f,
-            0.592593f, 0.0935673f, 0.0311891f, 0.530214f, 0.444444f, 0.94347f, 0.506823f, 0.00779727f,
-            0.68616f, 0.187135f, 0.124756f, 0.623782f, 0.288499f, 0.787524f, 0.350877f, 0.849903f,
-            0.436647f, 0.935673f, 0.873294f, 0.374269f, 0.538012f, 0.0389864f, 0.60039f, 0.101365f,
-            0.57115f, 0.0721248f, 0.758285f, 0.259259f, 0.719298f, 0.220273f, 0.532164f, 0.0331384f,
-            0.321637f, 0.820663f, 0.00974659f, 0.508772f, 0.469786f, 0.968811f, 0.282651f, 0.781676f,
-            0.539961f, 0.0409357f, 0.727096f, 0.22807f, 0.500975f, 0.00194932f, 0.563353f, 0.0643275f,
-            0.290448f, 0.789474f, 0.477583f, 0.976608f, 0.251462f, 0.750487f, 0.31384f, 0.812865f,
-            0.94152f, 0.442495f, 0.879142f, 0.380117f, 0.37232f, 0.871345f, 0.309942f, 0.808967f,
-            0.192982f, 0.692008f, 0.130604f, 0.62963f, 0.621832f, 0.122807f, 0.559454f, 0.0604289f,
-            0.660819f, 0.161793f, 0.723197f, 0.224172f, 0.403509f, 0.902534f, 0.840156f, 0.341131f,
-            0.411306f, 0.910331f, 0.473684f, 0.97271f, 0.653021f, 0.153996f, 0.0916179f, 0.590643f,
-            0.196881f, 0.695906f, 0.384016f, 0.883041f, 0.0955166f, 0.594542f, 0.157895f, 0.65692f,
-            0.945419f, 0.446394f, 0.633528f, 0.134503f, 0.844055f, 0.345029f, 0.906433f, 0.407407f,
-            0.165692f, 0.664717f, 0.103314f, 0.602339f, 0.126706f, 0.625731f, 0.189084f, 0.688109f,
-            0.91423f, 0.415205f, 0.851852f, 0.352827f, 0.875244f, 0.376218f, 0.937622f, 0.438596f,
-            0.317739f, 0.816764f, 0.255361f, 0.754386f, 0.996101f, 0.497076f, 0.933723f, 0.434698f,
-            0.567251f, 0.0682261f, 0.504873f, 0.00584795f, 0.247563f, 0.746589f, 0.185185f, 0.684211f,
-            0.037037f, 0.536062f, 0.0994152f, 0.598441f, 0.777778f, 0.278752f, 0.465887f, 0.964912f,
-            0.785575f, 0.28655f, 0.847953f, 0.348928f, 0.0292398f, 0.528265f, 0.7154f, 0.216374f,
-            0.39961f, 0.898636f, 0.961014f, 0.461988f, 0.0487329f, 0.547758f, 0.111111f, 0.610136f,
-            0.649123f, 0.150097f, 0.212476f, 0.711501f, 0.797271f, 0.298246f, 0.859649f, 0.360624f,
-            0.118908f, 0.617934f, 0.0565302f, 0.555556f, 0.329435f, 0.82846f, 0.516569f, 0.0175439f,
-            0.867446f, 0.368421f, 0.805068f, 0.306043f, 0.578947f, 0.079922f, 0.267057f, 0.766082f,
-            0.270955f, 0.76998f, 0.707602f, 0.208577f, 0.668616f, 0.169591f, 0.606238f, 0.107212f,
-            0.520468f, 0.0214425f, 0.45809f, 0.957115f, 0.419103f, 0.918129f, 0.356725f, 0.855751f,
-            0.988304f, 0.489279f, 0.426901f, 0.925926f, 0.450292f, 0.949318f, 0.512671f, 0.0136452f,
-            0.239766f, 0.738791f, 0.676413f, 0.177388f, 0.699805f, 0.20078f, 0.263158f, 0.762183f,
-            0.773879f, 0.274854f, 0.337232f, 0.836257f, 0.672515f, 0.173489f, 0.734893f, 0.235867f,
-            0.0253411f, 0.524366f, 0.586745f, 0.0877193f, 0.423002f, 0.922027f, 0.48538f, 0.984405f,
-            0.74269f, 0.243665f, 0.680312f, 0.181287f, 0.953216f, 0.454191f, 0.1423f, 0.641326f,
-            0.493177f, 0.992203f, 0.430799f, 0.929825f, 0.204678f, 0.703704f, 0.890838f, 0.391813f,
-            0.894737f, 0.395712f, 0.0838207f, 0.582846f, 0.0448343f, 0.54386f, 0.231969f, 0.730994f,
-            0.146199f, 0.645224f, 0.832359f, 0.333333f, 0.793372f, 0.294347f, 0.980507f, 0.481481f,
-            0.364522f, 0.863548f, 0.80117f, 0.302144f, 0.824561f, 0.325536f, 0.138402f, 0.637427f,
-            0.614035f, 0.11501f, 0.0526316f, 0.551657f, 0.0760234f, 0.575049f, 0.88694f, 0.387914f,
-        };
-        return mEnable ? LUT[offset & 511] : 0.5f;// branch prediction should optimize this!
-    }
-}; // DitherLUT class
-
-} // end nanovdb namespace
-
-#endif // NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/math/DitherLUT.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/DitherLUT.h instead.")
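For context (editorial sketch; `quantize()` below is hypothetical, not NanoVDB API): a 512-entry table like this is typically consumed during leaf quantization as an ordered-dither threshold, trading visible banding for high-frequency noise:

```cpp
#include <cmath>
#include <cstdint>

// Dithered quantization of one leaf value: the per-voxel threshold nudges the
// value before truncation; 'dither' would be DitherLUT()(offset) for the voxel.
inline uint32_t quantize(float v, float minVal, float invStep, float dither)
{
    return static_cast<uint32_t>(std::floor((v - minVal) * invStep + dither));
}
```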
diff --git a/nanovdb/nanovdb/util/ForEach.h b/nanovdb/nanovdb/util/ForEach.h
index fcd8eae15b..f4c20f2bce 100644
--- a/nanovdb/nanovdb/util/ForEach.h
+++ b/nanovdb/nanovdb/util/ForEach.h
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: MPL-2.0
 
 /*!
-    \file ForEach.h
+    \file nanovdb/util/ForEach.h
 
     \author Ken Museth
 
@@ -11,10 +11,10 @@
     \brief A unified wrapper for tbb::parallel_for and a naive std::thread fallback
 */
 
-#ifndef NANOVDB_FOREACH_H_HAS_BEEN_INCLUDED
-#define NANOVDB_FOREACH_H_HAS_BEEN_INCLUDED
+#ifndef NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED
 
-#include "Range.h"// for Range1D
+#include <nanovdb/util/Range.h>// for Range1D
 
 #ifdef NANOVDB_USE_TBB
 #include <tbb/parallel_for.h>
@@ -26,6 +26,8 @@
 
 namespace nanovdb {
 
+namespace util {
+
 /// @brief simple wrapper for tbb::parallel_for with a naive std fallback
 ///
 /// @param range Range, CoordBBox, tbb::blocked_range, blocked_range2D, or blocked_range3D.
@@ -83,6 +85,32 @@ inline void forEach(const ContainerT<T...> &c, size_t grainSize, const FuncT& func)
     forEach(Range1D(0, c.size(), grainSize), func);
 }
 
+}// namespace util
+
+/// @brief Simple wrapper for the function defined above
+template <typename FuncT>
+[[deprecated("Use nanovdb::util::forEach instead")]]
+inline void forEach(size_t begin, size_t end, size_t grainSize, const FuncT& func)
+{
+    util::forEach(util::Range1D(begin, end, grainSize), func);
+}
+
+/// @brief Simple wrapper for the function defined above, which works with std::containers
+template <template <typename...> class ContainerT, typename... T, typename FuncT>
+[[deprecated("Use nanovdb::util::forEach instead")]]
+inline void forEach(const ContainerT<T...> &c, const FuncT& func)
+{
+    util::forEach(util::Range1D(0, c.size(), 1), func);
+}
+
+/// @brief Simple wrapper for the function defined above, which works with std::containers
+template <template <typename...> class ContainerT, typename... T, typename FuncT>
+[[deprecated("Use nanovdb::util::forEach instead")]]
+inline void forEach(const ContainerT<T...> &c, size_t grainSize, const FuncT& func)
+{
+    util::forEach(util::Range1D(0, c.size(), grainSize), func);
+}
+
 }// namespace nanovdb
 
-#endif // NANOVDB_FOREACH_H_HAS_BEEN_INCLUDED
+#endif // NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED
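The wrapper keeps its call shape after the move; only the namespace changes. A minimal usage sketch against `nanovdb::util::forEach` as declared above (TBB when NANOVDB_USE_TBB is defined, a std::thread fallback otherwise):

```cpp
#include <nanovdb/util/ForEach.h>
#include <vector>

int main()
{
    std::vector<float> v(1000, 1.0f);
    // Range1D(begin, end, grainSize); the functor is handed sub-ranges.
    nanovdb::util::forEach(nanovdb::util::Range1D(0, v.size(), 128),
                           [&v](const nanovdb::util::Range1D& r) {
        for (size_t i = r.begin(); i != r.end(); ++i) v[i] *= 2.0f;
    });
    return 0;
}
```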
diff --git a/nanovdb/nanovdb/util/GridBuilder.h b/nanovdb/nanovdb/util/GridBuilder.h
index 30fba27f94..bc1ce63eb5 100644
--- a/nanovdb/nanovdb/util/GridBuilder.h
+++ b/nanovdb/nanovdb/util/GridBuilder.h
@@ -1,2314 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file GridBuilder.h
-
-    \author Ken Museth
-
-    \date June 26, 2020
-
-    \brief This file defines a minimum set of tree nodes and tools that
-           can be used (instead of OpenVDB) to build nanovdb grids on the CPU.
-*/
-
-#ifndef NANOVDB_GRID_BUILDER_H_HAS_BEEN_INCLUDED
-#define NANOVDB_GRID_BUILDER_H_HAS_BEEN_INCLUDED
-
-#include <map>
-
-#include <limits>
-#include <sstream> // for stringstream
-#include <vector>
-#include <cstring> // for memcpy
-#include <mutex>
-#include <array>
-#include <atomic>
-
-#include <nanovdb/NanoVDB.h>
-#include "Range.h"
-#include "ForEach.h"
-
-namespace nanovdb {
-
-namespace build {
-
-// ----------------------------> Forward declarations of random access methods <--------------------------------------
-
-template <typename T> struct GetValue;
-template <typename T> struct SetValue;
-template <typename T> struct TouchLeaf;
-template <typename T> struct GetState;
-template <typename T> struct ProbeValue;
-
-// ----------------------------> RootNode <--------------------------------------
-
-template <typename ChildT>
-struct RootNode
-{
-    using ValueType = typename ChildT::ValueType;
-    using BuildType = typename ChildT::BuildType;
-    using ChildNodeType = ChildT;
-    using LeafNodeType = typename ChildT::LeafNodeType;
-    static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf
-    struct Tile {
-        Tile(ChildT* c = nullptr) : child(c) {}
-        Tile(const ValueType& v, bool s) : child(nullptr), value(v), state(s) {}
-        bool isChild() const { return child!=nullptr; }
-        bool isValue() const { return child==nullptr; }
-        bool isActive() const { return child==nullptr && state; }
-        ChildT*   child;
-        ValueType value;
-        bool      state;
-    };
-    using MapT = std::map<Coord, Tile>;
-    MapT      mTable;
-    ValueType mBackground;
-
-    Tile* probeTile(const Coord &ijk) {
-        auto iter = mTable.find(CoordToKey(ijk));
-        return iter == mTable.end() ? nullptr : &(iter->second);
-    }
-
-    const Tile* probeTile(const Coord &ijk) const {
-        auto iter = mTable.find(CoordToKey(ijk));
-        return iter == mTable.end() ? nullptr : &(iter->second);
-    }
-
-    class ChildIterator
-    {
-        const RootNode *mParent;
-        typename MapT::const_iterator mIter;
-    public:
-        ChildIterator() : mParent(nullptr), mIter() {}
-        ChildIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
-            while (mIter!=parent->mTable.end() && mIter->second.child==nullptr) ++mIter;
-        }
-        ChildIterator& operator=(const ChildIterator&) = default;
-        ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mIter->second.child;}
-        ChildT* operator->() const {NANOVDB_ASSERT(*this); return mIter->second.child;}
-        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
-        Coord getCoord()  const { NANOVDB_ASSERT(*this); return mIter->first;}
-        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
-        ChildIterator& operator++() {
-            NANOVDB_ASSERT(mParent);
-            ++mIter;
-            while (mIter!=mParent->mTable.end() && mIter->second.child==nullptr) ++mIter;
-            return *this;
-        }
-        ChildIterator operator++(int) {
-            auto tmp = *this;
-            ++(*this);
-            return tmp;
-        }
-        uint32_t pos() const {
-            NANOVDB_ASSERT(mParent);
-            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
-        }
-    }; // Member class ChildIterator
-
-    ChildIterator  cbeginChild()   const {return ChildIterator(this);}
-    ChildIterator  cbeginChildOn() const {return ChildIterator(this);}// match openvdb
-
-    class ValueIterator
-    {
-        const RootNode *mParent;
-        typename MapT::const_iterator mIter;
-    public:
-        ValueIterator() : mParent(nullptr), mIter() {}
-        ValueIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
-            while (mIter!=parent->mTable.end() && mIter->second.child!=nullptr) ++mIter;
-        }
-        ValueIterator& operator=(const ValueIterator&) = default;
-        ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;}
-        bool isActive() const {NANOVDB_ASSERT(*this); return mIter->second.state;}
-        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
-        Coord getCoord()  const { NANOVDB_ASSERT(*this); return mIter->first;}
-        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
-        ValueIterator& operator++() {
-            NANOVDB_ASSERT(mParent);
-            ++mIter;
-            while (mIter!=mParent->mTable.end() && mIter->second.child!=nullptr) ++mIter;
-            return *this;
-        }
-        ValueIterator operator++(int) {
-            auto tmp = *this;
-            ++(*this);
-            return tmp;
-        }
-        uint32_t pos() const {
-            NANOVDB_ASSERT(mParent);
-            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
-        }
-    }; // Member class ValueIterator
-
-    ValueIterator beginValue()      {return ValueIterator(this);}
-    ValueIterator cbeginValueAll() const {return ValueIterator(this);}
-
-    class ValueOnIterator
-    {
-        const RootNode *mParent;
-        typename MapT::const_iterator mIter;
-    public:
-        ValueOnIterator() : mParent(nullptr), mIter() {}
-        ValueOnIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
-            while (mIter!=parent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter;
-        }
-        ValueOnIterator& operator=(const ValueOnIterator&) = default;
-        ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;}
-        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
-        Coord getCoord()  const { NANOVDB_ASSERT(*this); return mIter->first;}
-        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
-        ValueOnIterator& operator++() {
-            NANOVDB_ASSERT(mParent);
-            ++mIter;
-            while (mIter!=mParent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter;
-            return *this;
-        }
-        ValueOnIterator operator++(int) {
-            auto tmp = *this;
-            ++(*this);
-            return tmp;
-        }
-        uint32_t pos() const {
-            NANOVDB_ASSERT(mParent);
-            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
-        }
-    }; // Member class ValueOnIterator
-
-    ValueOnIterator beginValueOn()  {return ValueOnIterator(this);}
-    ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);}
-
-    class TileIterator
-    {
-        const RootNode *mParent;
-        typename MapT::const_iterator mIter;
-    public:
-        TileIterator() : mParent(nullptr), mIter() {}
-        TileIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) {
-            NANOVDB_ASSERT(mParent);
-        }
-        TileIterator& operator=(const TileIterator&) = default;
-        const Tile& operator*() const {NANOVDB_ASSERT(*this); return mIter->second;}
-        const Tile* operator->() const {NANOVDB_ASSERT(*this); return &(mIter->second);}
-        Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;}
-        Coord getCoord()  const { NANOVDB_ASSERT(*this); return mIter->first;}
-        operator bool() const {return mParent && mIter!=mParent->mTable.end();}
-        const ChildT* probeChild(ValueType &value) {
-            NANOVDB_ASSERT(*this);
-            const ChildT *child = mIter->second.child;
-            if (child==nullptr) value = mIter->second.value;
-            return child;
-        }
-        bool isValueOn() const {return mIter->second.child==nullptr && mIter->second.state;}
-        TileIterator& operator++() {
-            NANOVDB_ASSERT(mParent);
-            ++mIter;
-            return *this;
-        }
-        TileIterator operator++(int) {
-            auto tmp = *this;
-            ++(*this);
-            return tmp;
-        }
-        uint32_t pos() const {
-            NANOVDB_ASSERT(mParent);
-            return uint32_t(std::distance(mParent->mTable.begin(), mIter));
-        }
-    }; // Member class TileIterator
-
-    TileIterator beginTile()        {return TileIterator(this);}
-    TileIterator cbeginChildAll() const {return TileIterator(this);}
-
-    //class DenseIterator : public TileIterator
-
-    RootNode(const ValueType& background) : mBackground(background) {}
-    RootNode(const RootNode&) = delete; // disallow copy-construction
-    RootNode(RootNode&&) = default; // allow move construction
-    RootNode& operator=(const RootNode&) = delete; // disallow copy assignment
-    RootNode& operator=(RootNode&&) = default; // allow move assignment
-
-    ~RootNode() { this->clear(); }
-
-    uint32_t tileCount()    const { return uint32_t(mTable.size()); }
-    uint32_t getTableSize() const { return uint32_t(mTable.size()); }// match openvdb
-    const ValueType& background() const {return mBackground;}
-
-    void nodeCount(std::array<size_t, 3> &count) const
-    {
-        for (auto it = this->cbeginChild(); it; ++it) {
-            count[ChildT::LEVEL] += 1;
-            it->nodeCount(count);
-        }
-    }
-
-    bool empty() const { return mTable.empty(); }
-
-    void clear()
-    {
-        for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) delete iter->second.child;
-        mTable.clear();
-    }
-
-    static Coord CoordToKey(const Coord& ijk) { return ijk & ~ChildT::MASK; }
-
-#ifdef NANOVDB_NEW_ACCESSOR_METHODS
-    template <typename OpT, typename... ArgsT>
-    auto get(const Coord& ijk, ArgsT&&... args) const
-    {
-        if (const Tile *tile = this->probeTile(ijk)) {
-            if (auto *child = tile->child) return child->template get<OpT>(ijk, args...);
-            return OpT::get(*tile, args...);
-        }
-        return OpT::get(*this, args...);
-    }
-    template <typename OpT, typename... ArgsT>
-    auto set(const Coord& ijk, ArgsT&&... args)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        NANOVDB_ASSERT(child);
-        return child->template set<OpT>(ijk, args...);
-    }
-    template <typename OpT, typename AccT, typename... ArgsT>
-    auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const
-    {
-        if (const Tile *tile = this->probeTile(ijk)) {
-            if (auto *child = tile->child) {
-                acc.insert(ijk, child);
-                return child->template get<OpT>(ijk, args...);
-            }
-            return OpT::get(*tile, args...);
-        }
-        return OpT::get(*this, args...);
-    }
-
-    template <typename OpT, typename AccT, typename... ArgsT>
-    auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        NANOVDB_ASSERT(child);
-        acc.insert(ijk, child);
-        return child->template setAndCache<OpT>(ijk, acc, args...);
-    }
-    ValueType getValue(const Coord& ijk) const {return this->template get<GetValue<BuildType>>(ijk);}
-    ValueType getValue(int i, int j, int k) const {return this->template get<GetValue<BuildType>>(Coord(i,j,k));}
-    ValueType operator()(const Coord& ijk) const {return this->template get<GetValue<BuildType>>(ijk);}
-    ValueType operator()(int i, int j, int k) const {return this->template get<GetValue<BuildType>>(Coord(i,j,k));}
-    void setValue(const Coord& ijk, const ValueType& value) {this->template set<SetValue<BuildType>>(ijk, value);}
-    bool probeValue(const Coord& ijk, ValueType& value) const {return this->template get<ProbeValue<BuildType>>(ijk, value);}
-    bool isActive(const Coord& ijk) const {return this->template get<GetState<BuildType>>(ijk);}
-#else
-    ValueType getValue(const Coord& ijk) const
-    {
-#if 1
-        if (auto *tile = this->probeTile(ijk)) return tile->child ? tile->child->getValue(ijk) : tile->value;
-        return mBackground;
-#else
-        auto iter = mTable.find(CoordToKey(ijk));
-        if (iter == mTable.end()) {
-            return mBackground;
-        } else if (iter->second.child) {
-            return iter->second.child->getValue(ijk);
-        } else {
-            return iter->second.value;
-        }
-#endif
-    }
-    ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));}
-
-    void setValue(const Coord& ijk, const ValueType& value)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        NANOVDB_ASSERT(child);
-        child->setValue(ijk, value);
-    }
-
-    template <typename AccT>
-    bool isActiveAndCache(const Coord& ijk, AccT& acc) const
-    {
-        auto iter = mTable.find(CoordToKey(ijk));
-        if (iter == mTable.end())
-            return false;
-        if (iter->second.child) {
-            acc.insert(ijk, iter->second.child);
-            return iter->second.child->isActiveAndCache(ijk, acc);
-        }
-        return iter->second.state;
-    }
-
-    template <typename AccT>
-    ValueType getValueAndCache(const Coord& ijk, AccT& acc) const
-    {
-        auto iter = mTable.find(CoordToKey(ijk));
-        if (iter == mTable.end())
-            return mBackground;
-        if (iter->second.child) {
-            acc.insert(ijk, iter->second.child);
-            return iter->second.child->getValueAndCache(ijk, acc);
-        }
-        return iter->second.value;
-    }
-
-    template <typename AccT>
-    void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        NANOVDB_ASSERT(child);
-        acc.insert(ijk, child);
-        child->setValueAndCache(ijk, value, acc);
-    }
-    template <typename AccT>
-    void setValueOnAndCache(const Coord& ijk, AccT& acc)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        NANOVDB_ASSERT(child);
-        acc.insert(ijk, child);
-        child->setValueOnAndCache(ijk, acc);
-    }
-    template <typename AccT>
-    void touchLeafAndCache(const Coord &ijk, AccT& acc)
-    {
-        ChildT* child = nullptr;
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if (iter == mTable.end()) {
-            child = new ChildT(ijk, mBackground, false);
-            mTable[key] = Tile(child);
-        } else if (iter->second.child != nullptr) {
-            child = iter->second.child;
-        } else {
-            child = new ChildT(ijk, iter->second.value, iter->second.state);
-            iter->second.child = child;
-        }
-        acc.insert(ijk, child);
-        child->touchLeafAndCache(ijk, acc);
-    }
-#endif// NANOVDB_NEW_ACCESSOR_METHODS
-
-    template <typename NodeT>
-    uint32_t nodeCount() const
-    {
-        static_assert(is_same<ValueType, typename NodeT::ValueType>::value, "Root::getNodes: Invalid type");
-        static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error");
-        uint32_t sum = 0;
-        for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) {
-            if (iter->second.child == nullptr) continue; // skip tiles
-            if constexpr(is_same<NodeT, ChildT>::value) { //resolved at compile-time
-                ++sum;
-            } else {
-                sum += iter->second.child->template nodeCount<NodeT>();
-            }
-        }
-        return sum;
-    }
-
-    template <typename NodeT>
-    void getNodes(std::vector<NodeT*>& array)
-    {
-        static_assert(is_same<ValueType, typename NodeT::ValueType>::value, "Root::getNodes: Invalid type");
-        static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error");
-        for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) {
-            if (iter->second.child == nullptr)
-                continue;
-            if constexpr(is_same<NodeT, ChildT>::value) { //resolved at compile-time
-                array.push_back(reinterpret_cast<NodeT*>(iter->second.child));
-            } else {
-                iter->second.child->getNodes(array);
-            }
-        }
-    }
-
-    void addChild(ChildT*& child)
-    {
-        NANOVDB_ASSERT(child);
-        const Coord key = CoordToKey(child->mOrigin);
-        auto iter = mTable.find(key);
-        if (iter != mTable.end() && iter->second.child != nullptr) { // existing child node
-            delete iter->second.child;
-            iter->second.child = child;
-        } else {
-            mTable[key] = Tile(child);
-        }
-        child = nullptr;
-    }
-
-    /// @brief Add a tile containing voxel (i, j, k) at the specified tree level,
-    ///        creating a new branch if necessary. Delete any existing lower-level nodes
-    ///        that contain (i, j, k).
-    /// @tparam level tree level at which the tile is inserted. Must be 1, 2 or 3.
-    /// @param ijk   Index coordinate that maps to the tile being inserted
-    /// @param value Value of the tile
-    /// @param state Binary state of the tile
-    template <uint32_t level>
-    void addTile(const Coord& ijk, const ValueType& value, bool state)
-    {
-        static_assert(level > 0 && level <= LEVEL, "invalid template value of level");
-        const Coord key = CoordToKey(ijk);
-        auto iter = mTable.find(key);
-        if constexpr(level == LEVEL) {
-            if (iter == mTable.end()) {
-                mTable[key] = Tile(value, state);
-            } else if (iter->second.child == nullptr) {
-                iter->second.value = value;
-                iter->second.state = state;
-            } else {
-                delete iter->second.child;
-                iter->second.child = nullptr;
-                iter->second.value = value;
-                iter->second.state = state;
-            }
-        } else if constexpr(level < LEVEL) {
-            ChildT* child = nullptr;
-            if (iter == mTable.end()) {
-                child = new ChildT(ijk, mBackground, false);
-                mTable[key] = Tile(child);
-            } else if (iter->second.child != nullptr) {
-                child = iter->second.child;
-            } else {
-                child = new ChildT(ijk, iter->second.value, iter->second.state);
-                iter->second.child = child;
-            }
-            child->template addTile<level>(ijk, value, state);
-        }
-    }
-
-    template <typename NodeT>
-    void addNode(NodeT*& node)
-    {
-        if constexpr(is_same<NodeT, ChildT>::value) { //resolved at compile-time
-            this->addChild(reinterpret_cast<ChildT*&>(node));
-        } else {
-            ChildT* child = nullptr;
-            const Coord key = CoordToKey(node->mOrigin);
-            auto iter = mTable.find(key);
-            if (iter == mTable.end()) {
-                child = new ChildT(node->mOrigin, mBackground, false);
-                mTable[key] = Tile(child);
-            } else if (iter->second.child != nullptr) {
-                child = iter->second.child;
-            } else {
-                child = new ChildT(node->mOrigin, iter->second.value, iter->second.state);
-                iter->second.child = child;
-            }
-            child->addNode(node);
-        }
-    }
-
-    void merge(RootNode &other)
-    {
-        for (auto iter1 = other.mTable.begin(); iter1 != other.mTable.end(); ++iter1) {
-            if (iter1->second.child == nullptr) continue;// ignore input tiles
-            auto iter2 = mTable.find(iter1->first);
-            if (iter2 == mTable.end() || iter2->second.child == nullptr) {
-                mTable[iter1->first] = Tile(iter1->second.child);
-                iter1->second.child = nullptr;
-            } else {
-                iter2->second.child->merge(*iter1->second.child);
-            }
-        }
-        other.clear();
-    }
-
-    template <typename T>
-    typename std::enable_if<std::is_floating_point<T>::value>::type
-    signedFloodFill(T outside);
-
-}; // build::RootNode
-
-//================================================================================================
-
-template <typename ChildT>
-template <typename T>
-inline typename std::enable_if<std::is_floating_point<T>::value>::type
-RootNode<ChildT>::signedFloodFill(T outside)
-{
-    std::map<Coord, ChildT*> nodeKeys;
-    for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) {
-        if (iter->second.child == nullptr)
-            continue;
-        nodeKeys.insert(std::pair<Coord, ChildT*>(iter->first, iter->second.child));
-    }
-
-    // We employ a simple z-scanline algorithm that inserts inactive tiles with
-    // the inside value if they are sandwiched between inside child nodes only!
-    auto b = nodeKeys.begin(), e = nodeKeys.end();
-    if (b == e)
-        return;
-    for (auto a = b++; b != e; ++a, ++b) {
-        Coord d = b->first - a->first; // delta of neighboring coordinates
-        if (d[0] != 0 || d[1] != 0 || d[2] == int(ChildT::DIM))
-            continue; // not same z-scanline or neighbors
-        const ValueType fill[] = {a->second->getLastValue(), b->second->getFirstValue()};
-        if (!(fill[0] < 0) || !(fill[1] < 0))
-            continue; // scanline isn't inside
-        Coord c = a->first + Coord(0u, 0u, ChildT::DIM);
-        for (; c[2] != b->first[2]; c[2] += ChildT::DIM) {
-            const Coord key = RootNode<ChildT>::CoordToKey(c);
-            mTable[key] = typename RootNode<ChildT>::Tile(-outside, false); // inactive tile
-        }
-    }
-} // build::RootNode::signedFloodFill
-
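The z-scanline rule above, reduced to one dimension as an editorial sketch (hypothetical names): between two nodes on the same scanline whose facing values are both inside (negative), every missing tile in the gap becomes an inactive tile carrying the inside value:

```cpp
#include <map>

// 1D analogue of build::RootNode::signedFloodFill: keys are node origins along z,
// values are the signed distances sampled at the nodes' facing ends.
void fillInsideGaps(const std::map<int, float>& nodes, int dim, float outside,
                    std::map<int, float>& tiles)
{
    if (nodes.empty()) return;
    auto a = nodes.begin(), b = a;
    for (++b; b != nodes.end(); ++a, ++b) {
        if (b->first - a->first == dim) continue;           // adjacent: nothing to fill
        if (!(a->second < 0) || !(b->second < 0)) continue; // gap is not inside
        for (int z = a->first + dim; z != b->first; z += dim)
            tiles[z] = -outside;                            // inactive "inside" tile
    }
}
```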
-// ----------------------------> InternalNode <--------------------------------------
-
-template <typename ChildT>
-struct InternalNode
-{
-    using ValueType = typename ChildT::ValueType;
-    using BuildType = typename ChildT::BuildType;
-    using ChildNodeType = ChildT;
-    using LeafNodeType = typename ChildT::LeafNodeType;
-    static constexpr uint32_t LOG2DIM = ChildT::LOG2DIM + 1;
-    static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; //dimension in index space
-    static constexpr uint32_t DIM = 1u << TOTAL;
-    static constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); //number of tile values (or child pointers)
-    static constexpr uint32_t MASK = DIM - 1;
-    static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf
-    static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node
-    using MaskT = Mask<LOG2DIM>;
-    template <bool On>
-    using MaskIterT = typename MaskT::template Iterator<On>;
-    using NanoNodeT = typename NanoNode<BuildType, LEVEL>::Type;
-
-    struct Tile {
-        Tile(ChildT* c = nullptr) : child(c) {}
-        Tile(const ValueType& v) : value(v) {}
-        union{
-            ChildT*   child;
-            ValueType value;
-        };
-    };
-    Coord mOrigin;
-    MaskT mValueMask;
-    MaskT mChildMask;
-    Tile  mTable[SIZE];
-
-    union {
-        NanoNodeT *mDstNode;
-        uint64_t   mDstOffset;
-    };
-
-    /// @brief Visits child nodes of this node only
-    class ChildIterator : public MaskIterT<true>
-    {
-        using BaseT = MaskIterT<true>;
-        const InternalNode *mParent;
-    public:
-        ChildIterator() : BaseT(), mParent(nullptr) {}
-        ChildIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOn()), mParent(parent) {}
-        ChildIterator& operator=(const ChildIterator&) = default;
-        const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->mTable[BaseT::pos()].child;}
-        const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].child;}
-        Coord getCoord() const { NANOVDB_ASSERT(*this); return (*this)->origin();}
-    }; // Member class ChildIterator
-
-    ChildIterator beginChild()         {return ChildIterator(this);}
-    ChildIterator cbeginChildOn() const {return ChildIterator(this);}// match openvdb
-
-    /// @brief Visits all tile values in this node, i.e. both inactive and active tiles
-    class ValueIterator : public MaskIterT<false>
-    {
-        using BaseT = MaskIterT<false>;
-        const InternalNode *mParent;
-    public:
-        ValueIterator() : BaseT(), mParent(nullptr) {}
-        ValueIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOff()), mParent(parent) {}
-        ValueIterator& operator=(const ValueIterator&) = default;
-        ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;}
-        Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());}
-        bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(BaseT::pos());}
-    }; // Member class ValueIterator
-
-    ValueIterator beginValue()          {return ValueIterator(this);}
-    ValueIterator cbeginValueAll() const {return ValueIterator(this);}
-
-    /// @brief Visits active tile values of this node only
-    class ValueOnIterator : public MaskIterT<true>
-    {
-        using BaseT = MaskIterT<true>;
-        const InternalNode *mParent;
-    public:
-        ValueOnIterator() : BaseT(), mParent(nullptr) {}
-        ValueOnIterator(const InternalNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {}
-        ValueOnIterator& operator=(const ValueOnIterator&) = default;
-        ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;}
-        Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());}
-    }; // Member class ValueOnIterator
-
-    ValueOnIterator beginValueOn()      {return ValueOnIterator(this);}
-    ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);}
-
-    /// @brief Visits all tile values and child nodes of this node
-    class DenseIterator : public MaskT::DenseIterator
-    {
-        using BaseT = typename MaskT::DenseIterator;
-        const InternalNode *mParent;
-    public:
-        DenseIterator() : BaseT(), mParent(nullptr) {}
-        DenseIterator(const InternalNode* parent) : BaseT(0), mParent(parent) {}
-        DenseIterator& operator=(const DenseIterator&) = default;
-        ChildT* probeChild(ValueType& value) const
-        {
-            NANOVDB_ASSERT(mParent && bool(*this));
-            ChildT *child = nullptr;
-            if (mParent->mChildMask.isOn(BaseT::pos())) {
-                child = mParent->mTable[BaseT::pos()].child;
-            } else {
-                value = mParent->mTable[BaseT::pos()].value;
-            }
-            return child;
-        }
-        Coord getCoord() const { NANOVDB_ASSERT(mParent && bool(*this)); return mParent->offsetToGlobalCoord(BaseT::pos());}
-    }; // Member class DenseIterator
-
-    DenseIterator beginDense()          {return DenseIterator(this);}
-    DenseIterator cbeginChildAll() const {return DenseIterator(this);}// matches openvdb
-
-    InternalNode(const Coord& origin, const ValueType& value, bool state)
-        : mOrigin(origin & ~MASK)
-        , mValueMask(state)
-        , mChildMask()
-        , mDstOffset(0)
-    {
-        for (uint32_t i = 0; i < SIZE; ++i) mTable[i].value = value;
-    }
-    InternalNode(const InternalNode&) = delete; // disallow copy-construction
-    InternalNode(InternalNode&&) = delete; // disallow move construction
-    InternalNode& operator=(const InternalNode&) = delete; // disallow copy assignment
-    InternalNode& operator=(InternalNode&&) = delete; // disallow move assignment
-    ~InternalNode()
-    {
-        for (auto iter = mChildMask.beginOn(); iter; ++iter) {
-            delete mTable[*iter].child;
-        }
-    }
-    const MaskT& getValueMask() const {return mValueMask;}
-    const MaskT& valueMask()    const {return mValueMask;}
-    const MaskT& getChildMask() const {return mChildMask;}
-    const MaskT& childMask()    const {return mChildMask;}
-    const Coord& origin()       const {return mOrigin;}
-
-    void nodeCount(std::array<size_t, 3> &count) const
-    {
-        count[ChildT::LEVEL] += mChildMask.countOn();
-        if constexpr(ChildT::LEVEL>0) {
-            for (auto it = const_cast<InternalNode*>(this)->beginChild(); it; ++it) it->nodeCount(count);
-        }
-    }
-
-    static uint32_t CoordToOffset(const Coord& ijk)
-    {
-        return (((ijk[0] & int32_t(MASK)) >> ChildT::TOTAL) << (2 * LOG2DIM)) +
-               (((ijk[1] & int32_t(MASK)) >> ChildT::TOTAL) << (LOG2DIM)) +
-               ((ijk[2] & int32_t(MASK)) >> ChildT::TOTAL);
-    }
-
-    static Coord OffsetToLocalCoord(uint32_t n)
-    {
-        NANOVDB_ASSERT(n < SIZE);
-        const uint32_t m = n & ((1 << 2 * LOG2DIM) - 1);
-        return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & ((1 << LOG2DIM) - 1));
-    }
-
-    void localToGlobalCoord(Coord& ijk) const
-    {
-        ijk <<= ChildT::TOTAL;
-        ijk += mOrigin;
-    }
-
-    Coord offsetToGlobalCoord(uint32_t n) const
-    {
-        Coord ijk = InternalNode::OffsetToLocalCoord(n);
-        this->localToGlobalCoord(ijk);
-        return ijk;
-    }
-
-    ValueType getFirstValue() const { return mChildMask.isOn(0) ? mTable[0].child->getFirstValue() : mTable[0].value; }
-    ValueType getLastValue()  const { return mChildMask.isOn(SIZE - 1) ? mTable[SIZE - 1].child->getLastValue() : mTable[SIZE - 1].value; }
-
-    template <typename OpT, typename... ArgsT>
-    auto get(const Coord& ijk, ArgsT&&... args) const
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        if (mChildMask.isOn(n)) return mTable[n].child->template get<OpT>(ijk, args...);
-        return OpT::get(*this, n, args...);
-    }
-
-    template <typename OpT, typename... ArgsT>
-    auto set(const Coord& ijk, ArgsT&&... args)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        NANOVDB_ASSERT(child);
-        return child->template set<OpT>(ijk, args...);
-    }
-
-    template <typename OpT, typename AccT, typename... ArgsT>
-    auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        if (mChildMask.isOff(n)) return OpT::get(*this, n, args...);
-        ChildT* child = mTable[n].child;
-        acc.insert(ijk, child);
-        if constexpr(ChildT::LEVEL == 0) {
-            return child->template get<OpT>(ijk, args...);
-        } else {
-            return child->template getAndCache<OpT>(ijk, acc, args...);
-        }
-    }
-
-    template <typename OpT, typename AccT, typename... ArgsT>
-    auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        NANOVDB_ASSERT(child);
-        acc.insert(ijk, child);
-        if constexpr(ChildT::LEVEL == 0) {
-            return child->template set<OpT>(ijk, args...);
-        } else {
-            return child->template setAndCache<OpT>(ijk, acc, args...);
-        }
-    }
-
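A self-contained round-trip check of the bit packing used by `CoordToOffset`/`OffsetToLocalCoord` above (editorial, not part of the diff; constants chosen for a lower internal node, i.e. LOG2DIM = 4 over ChildT::TOTAL = 3):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    constexpr uint32_t LOG2DIM = 4, CHILD_TOTAL = 3; // 16^3 children, each an 8^3 leaf
    constexpr uint32_t TOTAL = LOG2DIM + CHILD_TOTAL, MASK = (1u << TOTAL) - 1u;
    auto toOffset = [](int i, int j, int k) {
        return (((i & int32_t(MASK)) >> CHILD_TOTAL) << (2 * LOG2DIM)) +
               (((j & int32_t(MASK)) >> CHILD_TOTAL) << LOG2DIM) +
               ((k & int32_t(MASK)) >> CHILD_TOTAL);
    };
    const uint32_t n = uint32_t(toOffset(17, 33, 49)); // local child indices (2, 4, 6)
    const uint32_t m = n & ((1u << 2 * LOG2DIM) - 1u);
    assert((n >> (2 * LOG2DIM)) == 2 && (m >> LOG2DIM) == 4 && (m & ((1u << LOG2DIM) - 1u)) == 6);
    return 0;
}
```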
-#ifdef NANOVDB_NEW_ACCESSOR_METHODS
-    ValueType getValue(const Coord& ijk) const {return this->template get<GetValue<BuildType>>(ijk);}
-    LeafNodeType& setValue(const Coord& ijk, const ValueType& value){return this->template set<SetValue<BuildType>>(ijk, value);}
-#else
-    ValueType getValue(const Coord& ijk) const
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        if (mChildMask.isOn(n)) {
-            return mTable[n].child->getValue(ijk);
-        }
-        return mTable[n].value;
-    }
-    void setValue(const Coord& ijk, const ValueType& value)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        child->setValue(ijk, value);
-    }
-
-    template <typename AccT>
-    ValueType getValueAndCache(const Coord& ijk, AccT& acc) const
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        if (mChildMask.isOn(n)) {
-            acc.insert(ijk, const_cast<ChildT*>(mTable[n].child));
-            return mTable[n].child->getValueAndCache(ijk, acc);
-        }
-        return mTable[n].value;
-    }
-
-    template <typename AccT>
-    void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        acc.insert(ijk, child);
-        child->setValueAndCache(ijk, value, acc);
-    }
-
-    template <typename AccT>
-    void setValueOnAndCache(const Coord& ijk, AccT& acc)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        acc.insert(ijk, child);
-        child->setValueOnAndCache(ijk, acc);
-    }
-
-    template <typename AccT>
-    void touchLeafAndCache(const Coord &ijk, AccT& acc)
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        ChildT* child = nullptr;
-        if (mChildMask.isOn(n)) {
-            child = mTable[n].child;
-        } else {
-            child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n));
-            mTable[n].child = child;
-            mChildMask.setOn(n);
-        }
-        acc.insert(ijk, child);
-        if constexpr(LEVEL>1) child->touchLeafAndCache(ijk, acc);
-    }
-    template <typename AccT>
-    bool isActiveAndCache(const Coord& ijk, AccT& acc) const
-    {
-        const uint32_t n = CoordToOffset(ijk);
-        if (mChildMask.isOn(n)) {
-            acc.insert(ijk, const_cast<ChildT*>(mTable[n].child));
-            return mTable[n].child->isActiveAndCache(ijk, acc);
-        }
-        return mValueMask.isOn(n);
-    }
-#endif
-
-    template <typename NodeT>
-    uint32_t nodeCount() const
-    {
-        static_assert(is_same<ValueType, typename NodeT::ValueType>::value, "Node::getNodes: Invalid type");
-        NANOVDB_ASSERT(NodeT::LEVEL < LEVEL);
-        uint32_t sum = 0;
-        if constexpr(is_same<NodeT, ChildT>::value) { // resolved at compile-time
-            sum += mChildMask.countOn();
-        } else if constexpr(LEVEL>1) {
-            for (auto iter = mChildMask.beginOn(); iter; ++iter) {
-                sum += mTable[*iter].child->template nodeCount<NodeT>();
-            }
-        }
-        return sum;
-    }
-
-    template <typename NodeT>
-    void getNodes(std::vector<NodeT*>& array)
-    {
-        static_assert(is_same<ValueType, typename NodeT::ValueType>::value, "Node::getNodes: Invalid type");
-        NANOVDB_ASSERT(NodeT::LEVEL < LEVEL);
-        for (auto iter = mChildMask.beginOn(); iter; ++iter) {
-            if constexpr(is_same<NodeT, ChildT>::value) { // resolved at compile-time
-                array.push_back(reinterpret_cast<NodeT*>(mTable[*iter].child));
-            } else if constexpr(LEVEL>1) {
-                mTable[*iter].child->getNodes(array);
-            }
-        }
-    }
-
-    void addChild(ChildT*& child)
-    {
-        NANOVDB_ASSERT(child && (child->mOrigin & ~MASK) == this->mOrigin);
-        const uint32_t n = CoordToOffset(child->mOrigin);
-        if (mChildMask.isOn(n)) {
-            delete mTable[n].child;
-        } else {
-            mChildMask.setOn(n);
-        }
-        mTable[n].child = child;
-        child = nullptr;
-    }
-
-    /// @brief Add a tile containing voxel (i, j, k) at the specified tree level,
-    ///        creating a new branch if necessary. Delete any existing lower-level nodes
-    ///        that contain (i, j, k).
-    /// @tparam level tree level at which the tile is inserted. Must be 1 or 2.
-    /// @param ijk   Index coordinate that maps to the tile being inserted
-    /// @param value Value of the tile
-    /// @param state Binary state of the tile
-    template <uint32_t level>
-    void addTile(const Coord& ijk, const ValueType& value, bool state)
-    {
-        static_assert(level > 0 && level <= LEVEL, "invalid template value of level");
-        const uint32_t n = CoordToOffset(ijk);
-        if constexpr(level == LEVEL) {
-            if (mChildMask.isOn(n)) {
-                delete mTable[n].child;
-                mTable[n] = Tile(value);
-            } else {
-                mValueMask.set(n, state);
-                mTable[n].value = value;
-            }
-        } else if constexpr(level < LEVEL) {
-            ChildT* child = nullptr;
-            if (mChildMask.isOn(n)) {
-                child = mTable[n].child;
-            } else {
-                child = new ChildT(ijk, value, state);
-                mTable[n].child = child;
-                mChildMask.setOn(n);
-            }
-            child->template addTile<level>(ijk, value, state);
-        }
-    }
-
-    template <typename NodeT>
-    void addNode(NodeT*& node)
-    {
-        if constexpr(is_same<NodeT, ChildT>::value) { //resolved at compile-time
-            this->addChild(reinterpret_cast<ChildT*&>(node));
-        } else if constexpr(LEVEL>1) {
-            const uint32_t n = CoordToOffset(node->mOrigin);
-            ChildT* child = nullptr;
-            if (mChildMask.isOn(n)) {
-                child = mTable[n].child;
-            } else {
-                child = new ChildT(node->mOrigin, mTable[n].value, mValueMask.isOn(n));
-                mTable[n].child = child;
-                mChildMask.setOn(n);
-            }
-            child->addNode(node);
-        }
-    }
-
-    void merge(InternalNode &other)
-    {
-        for (auto iter = other.mChildMask.beginOn(); iter; ++iter) {
-            const uint32_t n = *iter;
-            if (mChildMask.isOn(n)) {
-                mTable[n].child->merge(*other.mTable[n].child);
-            } else {
-                mTable[n].child = other.mTable[n].child;
-                other.mChildMask.setOff(n);
-                mChildMask.setOn(n);
-            }
-        }
-    }
-
-    template <typename T>
-    typename std::enable_if<std::is_floating_point<T>::value>::type
-    signedFloodFill(T outside);
-
-}; // build::InternalNode
-
-//================================================================================================
-
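One detail worth noting in `merge()` above (editorial): ownership of `other`'s children transfers by copying the pointer and clearing the bit in `other.mChildMask`, so `other`'s destructor, which deletes children through its child mask, cannot double-delete. The same idiom in miniature (hypothetical types):

```cpp
#include <bitset>
#include <cstddef>

// Mask-guarded ownership transfer, as in build::InternalNode::merge().
struct Owner {
    int*           items[8] = {};
    std::bitset<8> mask; // which slots this Owner must delete
    ~Owner() { for (std::size_t i = 0; i < 8; ++i) if (mask[i]) delete items[i]; }
    void steal(Owner& other, std::size_t i) {
        items[i] = other.items[i];
        other.mask.reset(i); // the source no longer owns the child
        mask.set(i);
    }
};
```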
0; z != (1 << LOG2DIM); ++z) { - const uint32_t xyz = xy0 + z; // offset for block(x, y, z) - if (mChildMask.isOn(xyz)) { - zInside = mTable[xyz].child->getLastValue() < 0; - } else { - mTable[xyz].value = zInside ? -outside : outside; - } - } - } - } - } -} // build::InternalNode::signedFloodFill - -// ----------------------------> LeafNode <-------------------------------------- - -template -struct LeafNode -{ - using BuildType = BuildT; - using ValueType = typename BuildToValueMap::type; - using LeafNodeType = LeafNode; - static constexpr uint32_t LOG2DIM = 3; - static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes - static constexpr uint32_t DIM = 1u << TOTAL; - static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node - static constexpr uint32_t MASK = DIM - 1; // mask for bit operations - static constexpr uint32_t LEVEL = 0; // level 0 = leaf - static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node - using NodeMaskType = Mask; - template - using MaskIterT = typename Mask::template Iterator; - using NanoLeafT = typename NanoNode::Type; - - Coord mOrigin; - Mask mValueMask; - ValueType mValues[SIZE]; - union { - NanoLeafT *mDstNode; - uint64_t mDstOffset; - }; - - /// @brief Visits all active values in a leaf node - class ValueOnIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOnIterator() : BaseT(), mParent(nullptr) {} - ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} - ValueOnIterator& operator=(const ValueOnIterator&) = default; - ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOnIterator - - ValueOnIterator beginValueOn() {return ValueOnIterator(this);} - ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} - - /// @brief Visits all inactive values in a leaf node - class ValueOffIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOffIterator() : BaseT(), mParent(nullptr) {} - ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} - ValueOffIterator& operator=(const ValueOffIterator&) = default; - ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOffIterator - - ValueOffIterator beginValueOff() {return ValueOffIterator(this);} - ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} - - /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values - class ValueIterator - { - const LeafNode *mParent; - uint32_t mPos; - public: - ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} - ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} - ValueIterator& operator=(const ValueIterator&) = default; - ValueType operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues[mPos];} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} - bool isActive() const { NANOVDB_ASSERT(*this); return mParent->isActive(mPos);} - operator bool() const {return mPos < SIZE;} - ValueIterator& operator++() {++mPos; return *this;} - ValueIterator operator++(int) { - auto tmp = *this; - ++(*this); - return tmp; - } - }; // Member class ValueIterator - - ValueIterator beginValue() {return ValueIterator(this);} - ValueIterator cbeginValueAll() const {return ValueIterator(this);} - - LeafNode(const Coord& ijk, const ValueType& value, bool state) - : mOrigin(ijk & ~MASK) - , mValueMask(state) //invalid - , mDstOffset(0) - { - ValueType* target = mValues; - uint32_t n = SIZE; - while (n--) { - *target++ = value; - } - } - LeafNode(const LeafNode&) = delete; // disallow copy-construction - LeafNode(LeafNode&&) = delete; // disallow move construction - LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment - LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment - ~LeafNode() = default; - - const Mask& getValueMask() const {return mValueMask;} - const Mask& valueMask() const {return mValueMask;} - const Coord& origin() const {return mOrigin;} - - /// @brief Return the linear offset corresponding to the given coordinate - static uint32_t CoordToOffset(const Coord& ijk) - { - return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + - ((ijk[1] & int32_t(MASK)) << LOG2DIM) + - (ijk[2] & int32_t(MASK)); - } - - static Coord OffsetToLocalCoord(uint32_t n) - { - NANOVDB_ASSERT(n < SIZE); - const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); - return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); - } - - void localToGlobalCoord(Coord& ijk) const - { - ijk += mOrigin; - } - - Coord offsetToGlobalCoord(uint32_t n) const - { - Coord ijk = LeafNode::OffsetToLocalCoord(n); - this->localToGlobalCoord(ijk); - return ijk; - } - - ValueType getFirstValue() const { return mValues[0]; } - ValueType getLastValue() const { return mValues[SIZE - 1]; } - const ValueType& getValue(uint32_t i) const {return mValues[i];} - const ValueType& getValue(const Coord& ijk) const {return mValues[CoordToOffset(ijk)];} - - template - auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} - - template - auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} - -#ifndef NANOVDB_NEW_ACCESSOR_METHODS - template - const ValueType& getValueAndCache(const Coord& ijk, const AccT&) const - { - return mValues[CoordToOffset(ijk)]; - } - - template - void setValueAndCache(const Coord& ijk, const ValueType& value, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - mValues[n] = value; - } - - template - void setValueOnAndCache(const Coord& ijk, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - } - - template - bool isActiveAndCache(const Coord& ijk, const AccT&) const - { - return mValueMask.isOn(CoordToOffset(ijk)); - } -#endif - - void setValue(uint32_t n, const ValueType& value) - { - mValueMask.setOn(n); - mValues[n] = value; - } - void setValue(const Coord& ijk, const ValueType& value){this->setValue(CoordToOffset(ijk), value);} - - void merge(LeafNode &other) - { - other.mValueMask -= mValueMask; - for (auto iter = other.mValueMask.beginOn(); iter; ++iter) { - const uint32_t n = *iter; - mValues[n] = other.mValues[n]; - } - mValueMask |= other.mValueMask; - } - - template - typename std::enable_if::value>::type - signedFloodFill(T outside); - -}; // build::LeafNode - -//================================================================================================ - -template <> -struct LeafNode -{ - using ValueType = bool; - using BuildType = ValueMask; - using LeafNodeType = LeafNode; - static constexpr uint32_t LOG2DIM = 3; - static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes - static constexpr uint32_t DIM = 1u << TOTAL; - static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node - static constexpr uint32_t MASK = DIM - 1; // mask for bit operations - static constexpr uint32_t LEVEL = 0; // level 0 = leaf - static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node - using NodeMaskType = Mask; - template - using MaskIterT = typename Mask::template Iterator; - using NanoLeafT = typename NanoNode::Type; - - Coord mOrigin; - Mask mValueMask; - union { - NanoLeafT *mDstNode; - uint64_t mDstOffset; - }; - - /// @brief Visits all active values in a leaf node - class ValueOnIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOnIterator() : BaseT(), mParent(nullptr) {} - ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} - ValueOnIterator& operator=(const ValueOnIterator&) = default; - bool operator*() const {NANOVDB_ASSERT(*this); return true;} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOnIterator - - ValueOnIterator beginValueOn() {return ValueOnIterator(this);} - ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} - - /// @brief Visits all inactive values in a leaf node - class ValueOffIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOffIterator() : BaseT(), mParent(nullptr) {} - ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} - ValueOffIterator& operator=(const ValueOffIterator&) = default; - bool operator*() const {NANOVDB_ASSERT(*this); return false;} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOffIterator - - 
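The iterator classes defined in these leaf specializations all follow the same pattern: they wrap an iterator over the node's bit mask and translate the linear bit position back into a global coordinate via offsetToGlobalCoord. A minimal usage sketch (it assumes the pre-relocation header nanovdb/util/GridBuilder.h is on the include path; the function name printActive is invented for illustration):

#include <nanovdb/util/GridBuilder.h>
#include <cstdio>

// Visit only the active (on) voxels of one build-time leaf node.
void printActive(const nanovdb::build::LeafNode<float>& leaf)
{
    for (auto it = leaf.cbeginValueOn(); it; ++it) {
        const nanovdb::Coord ijk = it.getCoord(); // bit position mapped back to index space
        std::printf("(%i, %i, %i) = %f\n", ijk[0], ijk[1], ijk[2], *it);
    }
}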
ValueOffIterator beginValueOff() {return ValueOffIterator(this);} - ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} - - /// @brief Visits all values in a leaf node, i.e. both active and inactive values - class ValueIterator - { - const LeafNode *mParent; - uint32_t mPos; - public: - ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} - ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} - ValueIterator& operator=(const ValueIterator&) = default; - bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} - bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} - operator bool() const {return mPos < SIZE;} - ValueIterator& operator++() {++mPos; return *this;} - ValueIterator operator++(int) { - auto tmp = *this; - ++(*this); - return tmp; - } - }; // Member class ValueIterator - - ValueIterator beginValue() {return ValueIterator(this);} - ValueIterator cbeginValueAll() const {return ValueIterator(this);} - - LeafNode(const Coord& ijk, const ValueType&, bool state) - : mOrigin(ijk & ~MASK) - , mValueMask(state) //invalid - , mDstOffset(0) - { - } - LeafNode(const LeafNode&) = delete; // disallow copy-construction - LeafNode(LeafNode&&) = delete; // disallow move construction - LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment - LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment - ~LeafNode() = default; - - const Mask& valueMask() const {return mValueMask;} - const Mask& getValueMask() const {return mValueMask;} - const Coord& origin() const {return mOrigin;} - - /// @brief Return the linear offset corresponding to the given coordinate - static uint32_t CoordToOffset(const Coord& ijk) - { - return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + - ((ijk[1] & int32_t(MASK)) << LOG2DIM) + - (ijk[2] & int32_t(MASK)); - } - - static Coord OffsetToLocalCoord(uint32_t n) - { - NANOVDB_ASSERT(n < SIZE); - const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); - return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); - } - - void localToGlobalCoord(Coord& ijk) const {ijk += mOrigin;} - - Coord offsetToGlobalCoord(uint32_t n) const - { - Coord ijk = LeafNode::OffsetToLocalCoord(n); - this->localToGlobalCoord(ijk); - return ijk; - } - - bool getFirstValue() const { return mValueMask.isOn(0); } - bool getLastValue() const { return mValueMask.isOn(SIZE - 1); } - bool getValue(uint32_t i) const {return mValueMask.isOn(i);} - bool getValue(const Coord& ijk) const {return mValueMask.isOn(CoordToOffset(ijk));} - - template - auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} - - template - auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} - -#ifndef NANOVDB_NEW_ACCESSOR_METHODS - template - bool getValueAndCache(const Coord& ijk, const AccT&) const - { - return mValueMask.isOn(CoordToOffset(ijk)); - } - - template - void setValueAndCache(const Coord& ijk, bool, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - } - - template - void setValueOnAndCache(const Coord& ijk, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - } - - template - bool isActiveAndCache(const Coord& ijk, const AccT&) const - { - return mValueMask.isOn(CoordToOffset(ijk)); - } -#endif - - void setValue(uint32_t n, bool) {mValueMask.setOn(n);} - void setValue(const Coord& ijk) {mValueMask.setOn(CoordToOffset(ijk));} - - void merge(LeafNode &other) - { - mValueMask |= other.mValueMask; - } - -}; // build::LeafNode - -//================================================================================================ - -template <> -struct LeafNode -{ - using ValueType = bool; - using BuildType = ValueMask; - using LeafNodeType = LeafNode; - static constexpr uint32_t LOG2DIM = 3; - static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes - static constexpr uint32_t DIM = 1u << TOTAL; - static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node - static constexpr uint32_t MASK = DIM - 1; // mask for bit operations - static constexpr uint32_t LEVEL = 0; // level 0 = leaf - static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node - using NodeMaskType = Mask; - template - using MaskIterT = typename Mask::template Iterator; - using NanoLeafT = typename NanoNode::Type; - - Coord mOrigin; - Mask mValueMask, mValues; - union { - NanoLeafT *mDstNode; - uint64_t mDstOffset; - }; - - /// @brief Visits all active values in a leaf node - class ValueOnIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOnIterator() : BaseT(), mParent(nullptr) {} - ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} - ValueOnIterator& operator=(const ValueOnIterator&) = default; - bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOnIterator - - ValueOnIterator beginValueOn() {return ValueOnIterator(this);} - ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} - - /// @brief Visits all inactive values in a leaf node - class ValueOffIterator : public MaskIterT - { - using BaseT = MaskIterT; - const LeafNode *mParent; - public: - ValueOffIterator() : BaseT(), mParent(nullptr) {} - ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} - ValueOffIterator& operator=(const ValueOffIterator&) = default; - bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} - }; // Member class ValueOffIterator - - ValueOffIterator beginValueOff() {return ValueOffIterator(this);} - ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} - - /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values - class ValueIterator - { - const LeafNode *mParent; - uint32_t mPos; - public: - ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} - ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} - ValueIterator& operator=(const ValueIterator&) = default; - bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues.isOn(mPos);} - Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} - bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} - operator bool() const {return mPos < SIZE;} - ValueIterator& operator++() {++mPos; return *this;} - ValueIterator operator++(int) { - auto tmp = *this; - ++(*this); - return tmp; - } - }; // Member class ValueIterator - - ValueIterator beginValue() {return ValueIterator(this);} - ValueIterator cbeginValueAll() const {return ValueIterator(this);} - - LeafNode(const Coord& ijk, bool value, bool state) - : mOrigin(ijk & ~MASK) - , mValueMask(state) - , mValues(value) - , mDstOffset(0) - { - } - LeafNode(const LeafNode&) = delete; // disallow copy-construction - LeafNode(LeafNode&&) = delete; // disallow move construction - LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment - LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment - ~LeafNode() = default; - - const Mask& valueMask() const {return mValueMask;} - const Mask& getValueMask() const {return mValueMask;} - const Coord& origin() const {return mOrigin;} - - /// @brief Return the linear offset corresponding to the given coordinate - static uint32_t CoordToOffset(const Coord& ijk) - { - return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + - ((ijk[1] & int32_t(MASK)) << LOG2DIM) + - (ijk[2] & int32_t(MASK)); - } - - static Coord OffsetToLocalCoord(uint32_t n) - { - NANOVDB_ASSERT(n < SIZE); - const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); - return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); - } - - void localToGlobalCoord(Coord& ijk) const - { - ijk += mOrigin; - } - - Coord offsetToGlobalCoord(uint32_t n) const - { - Coord ijk = LeafNode::OffsetToLocalCoord(n); - this->localToGlobalCoord(ijk); - return ijk; - } - bool getFirstValue() const { return mValues.isOn(0); } - bool getLastValue() const { return mValues.isOn(SIZE - 1); } - - bool getValue(uint32_t i) const {return mValues.isOn(i);} - bool getValue(const Coord& ijk) const - { - return mValues.isOn(CoordToOffset(ijk)); - } -#ifndef NANOVDB_NEW_ACCESSOR_METHODS - template - bool isActiveAndCache(const Coord& ijk, const AccT&) const - { - return mValueMask.isOn(CoordToOffset(ijk)); - } - - template - bool getValueAndCache(const Coord& ijk, const AccT&) const - { - return mValues.isOn(CoordToOffset(ijk)); - } - - template - void setValueAndCache(const Coord& ijk, bool value, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - mValues.setOn(n); - } - - template - void setValueOnAndCache(const Coord& ijk, const AccT&) - { - const uint32_t n = CoordToOffset(ijk); - mValueMask.setOn(n); - } -#endif - - void setValue(uint32_t n, bool value) - { - mValueMask.setOn(n); - mValues.set(n, value); - } - void setValue(const Coord& ijk, bool value) {return this->setValue(CoordToOffset(ijk), value);} - - void merge(LeafNode &other) - { - mValues |= other.mValues; - mValueMask |= other.mValueMask; - } - -}; // build::LeafNode - -//================================================================================================ 
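Every leaf specialization above shares the same CoordToOffset bit packing: the low three bits of each axis are concatenated into a 9 bit linear offset, giving 8 x 8 x 8 = 512 voxels per leaf. A self-contained sketch of that index math (the constants are duplicated so the snippet compiles on its own):

#include <cassert>
#include <cstdint>

constexpr uint32_t LOG2DIM = 3;                   // leaf spans 2^3 = 8 voxels per axis
constexpr uint32_t MASK    = (1u << LOG2DIM) - 1; // 0b111

// Pack the low 3 bits of each axis into a linear offset in [0, 512).
uint32_t coordToOffset(int i, int j, int k)
{
    return ((i & MASK) << (2 * LOG2DIM)) + ((j & MASK) << LOG2DIM) + (k & MASK);
}

int main()
{
    assert(coordToOffset(5, 2, 7) == 5 * 64 + 2 * 8 + 7); // 343
    // Only the low bits matter, so any global coordinate maps to the same
    // offset as its leaf-local counterpart.
    assert(coordToOffset(5 - 8, 2 + 64, 7) == coordToOffset(5, 2, 7));
    return 0;
}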
- -template -template -inline typename std::enable_if::value>::type -LeafNode::signedFloodFill(T outside) -{ - const uint32_t first = *mValueMask.beginOn(); - if (first < SIZE) { - bool xInside = mValues[first] < 0, yInside = xInside, zInside = xInside; - for (uint32_t x = 0; x != DIM; ++x) { - const uint32_t x00 = x << (2 * LOG2DIM); - if (mValueMask.isOn(x00)) - xInside = mValues[x00] < 0; // element(x, 0, 0) - yInside = xInside; - for (uint32_t y = 0; y != DIM; ++y) { - const uint32_t xy0 = x00 + (y << LOG2DIM); - if (mValueMask.isOn(xy0)) - yInside = mValues[xy0] < 0; // element(x, y, 0) - zInside = yInside; - for (uint32_t z = 0; z != (1 << LOG2DIM); ++z) { - const uint32_t xyz = xy0 + z; // element(x, y, z) - if (mValueMask.isOn(xyz)) { - zInside = mValues[xyz] < 0; - } else { - mValues[xyz] = zInside ? -outside : outside; - } - } - } - } - } -} // build::LeafNode::signedFloodFill - -// ----------------------------> ValueAccessor <-------------------------------------- - -template -struct ValueAccessor -{ - using ValueType = typename BuildToValueMap::type; - using LeafT = build::LeafNode; - using Node1 = build::InternalNode; - using Node2 = build::InternalNode; - using RootNodeType = build::RootNode; - using LeafNodeType = typename RootNodeType::LeafNodeType; - - ValueAccessor(RootNodeType& root) - : mRoot(root) - , mKeys{Coord(Maximum::value()), Coord(Maximum::value()), Coord(Maximum::value())} - , mNode{nullptr, nullptr, nullptr} - { - } - ValueAccessor(ValueAccessor&&) = default; // allow move construction - ValueAccessor(const ValueAccessor&) = delete; // disallow copy construction - ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} - template - bool isCached(const Coord& ijk) const - { - return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && - (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && - (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; - } - - template - auto get(const Coord& ijk, ArgsT&&... args) const - { - if (this->template isCached(ijk)) { - return ((const LeafT*)mNode[0])->template get(ijk, args...); - } else if (this->template isCached(ijk)) { - return ((const Node1*)mNode[1])->template getAndCache(ijk, *this, args...); - } else if (this->template isCached(ijk)) { - return ((const Node2*)mNode[2])->template getAndCache(ijk, *this, args...); - } - return mRoot.template getAndCache(ijk, *this, args...); - } - - template - auto set(const Coord& ijk, ArgsT&&... 
args) const - { - if (this->template isCached(ijk)) { - return ((LeafT*)mNode[0])->template set(ijk, args...); - } else if (this->template isCached(ijk)) { - return ((Node1*)mNode[1])->template setAndCache(ijk, *this, args...); - } else if (this->template isCached(ijk)) { - return ((Node2*)mNode[2])->template setAndCache(ijk, *this, args...); - } - return mRoot.template setAndCache(ijk, *this, args...); - } - -#ifdef NANOVDB_NEW_ACCESSOR_METHODS - ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} - LeafT* setValue(const Coord& ijk, const ValueType& value) {return this->template set>(ijk, value);} - LeafT* setValueOn(const Coord& ijk) {return this->template set>(ijk);} - LeafT& touchLeaf(const Coord& ijk) {return this->template set>(ijk);} - bool isActive(const Coord& ijk) const {return this->template get>(ijk);} -#else - ValueType getValue(const Coord& ijk) const - { - if (this->template isCached(ijk)) { - return ((LeafT*)mNode[0])->getValueAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - return ((Node1*)mNode[1])->getValueAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - return ((Node2*)mNode[2])->getValueAndCache(ijk, *this); - } - return mRoot.getValueAndCache(ijk, *this); - } - - /// @brief Sets value in a leaf node and returns it. - LeafT* setValue(const Coord& ijk, const ValueType& value) - { - if (this->template isCached(ijk)) { - ((LeafT*)mNode[0])->setValueAndCache(ijk, value, *this); - } else if (this->template isCached(ijk)) { - ((Node1*)mNode[1])->setValueAndCache(ijk, value, *this); - } else if (this->template isCached(ijk)) { - ((Node2*)mNode[2])->setValueAndCache(ijk, value, *this); - } else { - mRoot.setValueAndCache(ijk, value, *this); - } - NANOVDB_ASSERT(this->isCached(ijk)); - return (LeafT*)mNode[0]; - } - void setValueOn(const Coord& ijk) - { - if (this->template isCached(ijk)) { - ((LeafT*)mNode[0])->setValueOnAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - ((Node1*)mNode[1])->setValueOnAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - ((Node2*)mNode[2])->setValueOnAndCache(ijk, *this); - } else { - mRoot.setValueOnAndCache(ijk, *this); - } - } - void touchLeaf(const Coord& ijk) const - { - if (this->template isCached(ijk)) { - return; - } else if (this->template isCached(ijk)) { - ((Node1*)mNode[1])->touchLeafAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - ((Node2*)mNode[2])->touchLeafAndCache(ijk, *this); - } else { - mRoot.touchLeafAndCache(ijk, *this); - } - } - bool isActive(const Coord& ijk) const - { - if (this->template isCached(ijk)) { - return ((LeafT*)mNode[0])->isActiveAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - return ((Node1*)mNode[1])->isActiveAndCache(ijk, *this); - } else if (this->template isCached(ijk)) { - return ((Node2*)mNode[2])->isActiveAndCache(ijk, *this); - } - return mRoot.isActiveAndCache(ijk, *this); - } -#endif - - bool isValueOn(const Coord& ijk) const { return this->isActive(ijk); } - template - void insert(const Coord& ijk, NodeT* node) const - { - mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; - mNode[NodeT::LEVEL] = node; - } - RootNodeType& mRoot; - mutable Coord mKeys[3]; - mutable void* mNode[3]; -}; // build::ValueAccessor - -// ----------------------------> Tree <-------------------------------------- - -template -struct Tree -{ - using ValueType = typename BuildToValueMap::type; - using Node0 = build::LeafNode; - using Node1 = build::InternalNode; - using Node2 = 
build::InternalNode; - using RootNodeType = build::RootNode; - using LeafNodeType = typename RootNodeType::LeafNodeType; - struct WriteAccessor; - - RootNodeType mRoot; - std::mutex mMutex; - - Tree(const ValueType &background) : mRoot(background) {} - Tree(const Tree&) = delete; // disallow copy construction - Tree(Tree&&) = delete; // disallow move construction - Tree& tree() {return *this;} - RootNodeType& root() {return mRoot;} - ValueType getValue(const Coord& ijk) const {return mRoot.getValue(ijk);} - ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} - void setValue(const Coord& ijk, const ValueType &value) {mRoot.setValue(ijk, value);} - std::array nodeCount() const - { - std::array count{0,0,0}; - mRoot.nodeCount(count); - return count; - } - /// @brief regular accessor for thread-safe reading and non-thread-safe writing - ValueAccessor getAccessor() { return ValueAccessor(mRoot); } - /// @brief special accessor for thread-safe writing only - WriteAccessor getWriteAccessor() { return WriteAccessor(mRoot, mMutex); } -};// build::Tree - -// ----------------------------> Tree::WriteAccessor <-------------------------------------- - -template -struct Tree::WriteAccessor -{ - using AccT = ValueAccessor; - using ValueType = typename AccT::ValueType; - using LeafT = typename AccT::LeafT; - using Node1 = typename AccT::Node1; - using Node2 = typename AccT::Node2; - using RootNodeType = typename AccT::RootNodeType; - - WriteAccessor(RootNodeType& parent, std::mutex &mx) - : mParent(parent) - , mRoot(parent.mBackground) - , mAcc(mRoot) - , mMutex(mx) - { - } - WriteAccessor(const WriteAccessor&) = delete; // disallow copy construction - WriteAccessor(WriteAccessor&&) = default; // allow move construction - ~WriteAccessor() { this->merge(); } - void merge() - { - mMutex.lock(); - mParent.merge(mRoot); - mMutex.unlock(); - } - inline void setValueOn(const Coord& ijk) {mAcc.setValueOn(ijk);} - inline void setValue(const Coord& ijk, const ValueType &value) {mAcc.setValue(ijk, value);} - - RootNodeType &mParent, mRoot; - AccT mAcc; - std::mutex &mMutex; -}; // build::Tree::WriteAccessor - -// ----------------------------> Grid <-------------------------------------- - -template -struct Grid : public Tree -{ - using BuildType = BuildT; - using ValueType = typename BuildToValueMap::type; - using TreeType = Tree; - using Node0 = build::LeafNode; - using Node1 = build::InternalNode; - using Node2 = build::InternalNode; - using RootNodeType = build::RootNode; - - GridClass mGridClass; - GridType mGridType; - Map mMap; - std::string mName; - - Grid(const ValueType &background, const std::string &name = "", GridClass gClass = GridClass::Unknown) - : TreeType(background) - , mGridClass(gClass) - , mGridType(mapToGridType()) - , mName(name) - { - mMap.set(1.0, Vec3d(0.0), 1.0); - } - TreeType& tree() {return *this;} - const GridType& gridType() const { return mGridType; } - const GridClass& gridClass() const { return mGridClass; } - const Map& map() const { return mMap; } - void setTransform(double scale=1.0, const Vec3d &translation = Vec3d(0.0)) {mMap.set(scale, translation, 1.0);} - const std::string& gridName() const { return mName; } - const std::string& getName() const { return mName; } - void setName(const std::string &name) { mName = name; } - /// @brief Sets grids values in domain of the @a bbox to those returned by the specified @a func with the - /// expected signature [](const Coord&)->ValueType. 
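As a concrete illustration of the functor evaluation described in this comment, the following hypothetical sketch builds a small sphere level set; the radius, bounding box and background value are invented, and the clamping is what makes far-away voxels return the background (and hence stay inactive) as explained in the note that follows:

#include <nanovdb/util/GridBuilder.h>
#include <cmath>

void makeSphere()
{
    const float background = 3.0f; // half-width of the narrow band, in voxels
    nanovdb::build::Grid<float> grid(background, "sphere", nanovdb::GridClass::LevelSet);
    auto sdf = [background](const nanovdb::Coord& ijk) -> float {
        const float d = std::sqrt(float(ijk[0]*ijk[0] + ijk[1]*ijk[1] + ijk[2]*ijk[2])) - 50.0f;
        // Clamp to +-background: far-outside voxels return the background and
        // remain inactive, and constant interior leaves can be pruned into tiles.
        return d <= -background ? -background : d >= background ? background : d;
    };
    grid(sdf, nanovdb::CoordBBox(nanovdb::Coord(-60), nanovdb::Coord(60)));
}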
- /// - /// @note If @a func returns a value equal to the background value of the input grid at a - /// specific voxel coordinate, then the active state of that coordinate is off! Else the value - /// value is set and the active state is on. This is done to allow for sparse grids to be generated. - /// - /// @param func Functor used to evaluate the grid values in the @a bbox - /// @param bbox Coordinate bounding-box over which the grid values will be set. - /// @param delta Specifies a lower threshold value for rendering (optional). Typically equals the voxel size - /// for level sets and otherwise it's zero. - template - void operator()(const Func& func, const CoordBBox& bbox, ValueType delta = ValueType(0)); -};// build::Grid - -template -template -void Grid::operator()(const Func& func, const CoordBBox& bbox, ValueType delta) -{ - auto &root = this->tree().root(); -#if __cplusplus >= 201703L - static_assert(is_same::type>::value, "GridBuilder: mismatched ValueType"); -#else// invoke_result was introduced in C++17 and result_of was removed in C++20 - static_assert(is_same::type>::value, "GridBuilder: mismatched ValueType"); -#endif - const CoordBBox leafBBox(bbox[0] >> Node0::TOTAL, bbox[1] >> Node0::TOTAL); - std::mutex mutex; - forEach(leafBBox, [&](const CoordBBox& b) { - Node0* leaf = nullptr; - for (auto it = b.begin(); it; ++it) { - Coord min(*it << Node0::TOTAL), max(min + Coord(Node0::DIM - 1)); - const CoordBBox b(min.maxComponent(bbox.min()), - max.minComponent(bbox.max()));// crop - if (leaf == nullptr) { - leaf = new Node0(b[0], root.mBackground, false); - } else { - leaf->mOrigin = b[0] & ~Node0::MASK; - NANOVDB_ASSERT(leaf->mValueMask.isOff()); - } - leaf->mDstOffset = 0;// no prune - for (auto ijk = b.begin(); ijk; ++ijk) { - const auto v = func(*ijk);// call functor - if (v != root.mBackground) leaf->setValue(*ijk, v);// don't insert background values - } - if (!leaf->mValueMask.isOff()) {// has active values - if (leaf->mValueMask.isOn()) {// only active values - const auto first = leaf->getFirstValue(); - int n=1; - while (n<512) {// 8^3 = 512 - if (leaf->mValues[n++] != first) break; - } - if (n == 512) leaf->mDstOffset = 1;// prune below - } - std::lock_guard guard(mutex); - NANOVDB_ASSERT(leaf != nullptr); - root.addNode(leaf); - NANOVDB_ASSERT(leaf == nullptr); - } - }// loop over sub-part of leafBBox - if (leaf) delete leaf; - }); - - // Prune leaf and tile nodes - for (auto it2 = root.mTable.begin(); it2 != root.mTable.end(); ++it2) { - if (auto *upper = it2->second.child) {//upper level internal node - for (auto it1 = upper->mChildMask.beginOn(); it1; ++it1) { - auto *lower = upper->mTable[*it1].child;// lower level internal node - for (auto it0 = lower->mChildMask.beginOn(); it0; ++it0) { - auto *leaf = lower->mTable[*it0].child;// leaf nodes - if (leaf->mDstOffset) { - lower->mTable[*it0].value = leaf->getFirstValue(); - lower->mChildMask.setOff(*it0); - lower->mValueMask.setOn(*it0); - delete leaf; - } - }// loop over leaf nodes - if (lower->mChildMask.isOff()) {//only tiles - const auto first = lower->getFirstValue(); - int n=1; - while (n < 4096) {// 16^3 = 4096 - if (lower->mTable[n++].value != first) break; - } - if (n == 4096) {// identical tile values so prune - upper->mTable[*it1].value = first; - upper->mChildMask.setOff(*it1); - upper->mValueMask.setOn(*it1); - delete lower; - } - } - }// loop over lower internal nodes - if (upper->mChildMask.isOff()) {//only tiles - const auto first = upper->getFirstValue(); - int n=1; - while (n < 32768) {// 32^3 = 
32768 - if (upper->mTable[n++].value != first) break; - } - if (n == 32768) {// identical tile values so prune - it2->second.value = first; - it2->second.state = upper->mValueMask.isOn(); - it2->second.child = nullptr; - delete upper; - } - } - }// is child node of the root - }// loop over root table -}// build::Grid::operator() - -//================================================================================================ - -template -using BuildLeaf = LeafNode; -template -using BuildLower = InternalNode>; -template -using BuildUpper = InternalNode>; -template -using BuildRoot = RootNode>; -template -using BuildTile = typename BuildRoot::Tile; - -using FloatGrid = Grid; -using Fp4Grid = Grid; -using Fp8Grid = Grid; -using Fp16Grid = Grid; -using FpNGrid = Grid; -using DoubleGrid = Grid; -using Int32Grid = Grid; -using UInt32Grid = Grid; -using Int64Grid = Grid; -using Vec3fGrid = Grid; -using Vec3dGrid = Grid; -using Vec4fGrid = Grid; -using Vec4dGrid = Grid; -using MaskGrid = Grid; -using IndexGrid = Grid; -using OnIndexGrid = Grid; -using BoolGrid = Grid; - -// ----------------------------> NodeManager <-------------------------------------- - -// GridT can be openvdb::Grid and nanovdb::build::Grid -template -class NodeManager -{ -public: - - using ValueType = typename GridT::ValueType; - using BuildType = typename GridT::BuildType; - using GridType = GridT; - using TreeType = typename GridT::TreeType; - using RootNodeType = typename TreeType::RootNodeType; - static_assert(RootNodeType::LEVEL == 3, "NodeManager expected LEVEL=3"); - using Node2 = typename RootNodeType::ChildNodeType; - using Node1 = typename Node2::ChildNodeType; - using Node0 = typename Node1::ChildNodeType; - - NodeManager(GridT &grid) : mGrid(grid) {this->init();} - void init() - { - mArray0.clear(); - mArray1.clear(); - mArray2.clear(); - auto counts = mGrid.tree().nodeCount(); - mArray0.reserve(counts[0]); - mArray1.reserve(counts[1]); - mArray2.reserve(counts[2]); - - for (auto it2 = mGrid.tree().root().cbeginChildOn(); it2; ++it2) { - Node2 &upper = const_cast(*it2); - mArray2.emplace_back(&upper); - for (auto it1 = upper.cbeginChildOn(); it1; ++it1) { - Node1 &lower = const_cast(*it1); - mArray1.emplace_back(&lower); - for (auto it0 = lower.cbeginChildOn(); it0; ++it0) { - Node0 &leaf = const_cast(*it0); - mArray0.emplace_back(&leaf); - }// loop over leaf nodes - }// loop over lower internal nodes - }// loop over root node - } - - /// @brief Return the number of tree nodes at the specified level - /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level - uint64_t nodeCount(int level) const - { - NANOVDB_ASSERT(level==0 || level==1 || level==2); - return level==0 ? mArray0.size() : level==1 ? 
mArray1.size() : mArray2.size(); - } - - template - typename enable_if::type node(int i) {return *mArray0[i];} - template - typename enable_if::type node(int i) const {return *mArray0[i];} - template - typename enable_if::type node(int i) {return *mArray1[i];} - template - typename enable_if::type node(int i) const {return *mArray1[i];} - template - typename enable_if::type node(int i) {return *mArray2[i];} - template - typename enable_if::type node(int i) const {return *mArray2[i];} - - /// @brief Return the i'th leaf node with respect to breadth-first ordering - const Node0& leaf(uint32_t i) const { return *mArray0[i]; } - Node0& leaf(uint32_t i) { return *mArray0[i]; } - uint64_t leafCount() const {return mArray0.size();} - - /// @brief Return the i'th lower internal node with respect to breadth-first ordering - const Node1& lower(uint32_t i) const { return *mArray1[i]; } - Node1& lower(uint32_t i) { return *mArray1[i]; } - uint64_t lowerCount() const {return mArray1.size();} - - /// @brief Return the i'th upper internal node with respect to breadth-first ordering - const Node2& upper(uint32_t i) const { return *mArray2[i]; } - Node2& upper(uint32_t i) { return *mArray2[i]; } - uint64_t upperCount() const {return mArray2.size();} - - RootNodeType& root() {return mGrid.tree().root();} - const RootNodeType& root() const {return mGrid.tree().root();} - - TreeType& tree() {return mGrid.tree();} - const TreeType& tree() const {return mGrid.tree();} - - GridType& grid() {return mGrid;} - const GridType& grid() const {return mGrid;} - -protected: - - GridT &mGrid; - std::vector mArray0; // leaf nodes - std::vector mArray1; // lower internal nodes - std::vector mArray2; // upper internal nodes - -};// NodeManager - -template -typename enable_if::value>::type -sdfToLevelSet(NodeManagerT &mgr) -{ - mgr.grid().mGridClass = GridClass::LevelSet; - // Note that the bottom-up flood filling is essential - const auto outside = mgr.root().mBackground; - forEach(0, mgr.leafCount(), 8, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) mgr.leaf(i).signedFloodFill(outside); - }); - forEach(0, mgr.lowerCount(), 1, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) mgr.lower(i).signedFloodFill(outside); - }); - forEach(0, mgr.upperCount(), 1, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) mgr.upper(i).signedFloodFill(outside); - }); - mgr.root().signedFloodFill(outside); -}// sdfToLevelSet - -template -void levelSetToFog(NodeManagerT &mgr, bool rebuild = true) -{ - using ValueType = typename NodeManagerT::ValueType; - mgr.grid().mGridClass = GridClass::FogVolume; - const ValueType d = -mgr.root().mBackground, w = 1.0f / d; - std::atomic_bool prune{false}; - auto op = [&](ValueType& v) -> bool { - if (v > ValueType(0)) { - v = ValueType(0); - return false; - } - v = v > d ? 
v * w : ValueType(1); - return true; - }; - forEach(0, mgr.leafCount(), 8, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - auto& leaf = mgr.leaf(i); - for (uint32_t i = 0; i < 512u; ++i) leaf.mValueMask.set(i, op(leaf.mValues[i])); - } - }); - forEach(0, mgr.lowerCount(), 1, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - auto& node = mgr.lower(i); - for (uint32_t i = 0; i < 4096u; ++i) { - if (node.mChildMask.isOn(i)) { - auto* leaf = node.mTable[i].child; - if (leaf->mValueMask.isOff()) {// prune leaf node - node.mTable[i].value = leaf->getFirstValue(); - node.mChildMask.setOff(i); - delete leaf; - prune = true; - } - } else { - node.mValueMask.set(i, op(node.mTable[i].value)); - } - } - } - }); - forEach(0, mgr.upperCount(), 1, [&](const Range1D& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - auto& node = mgr.upper(i); - for (uint32_t i = 0; i < 32768u; ++i) { - if (node.mChildMask.isOn(i)) {// prune lower internal node - auto* child = node.mTable[i].child; - if (child->mChildMask.isOff() && child->mValueMask.isOff()) { - node.mTable[i].value = child->getFirstValue(); - node.mChildMask.setOff(i); - delete child; - prune = true; - } - } else { - node.mValueMask.set(i, op(node.mTable[i].value)); - } - } - } - }); - - for (auto it = mgr.root().mTable.begin(); it != mgr.root().mTable.end(); ++it) { - auto* child = it->second.child; - if (child == nullptr) { - it->second.state = op(it->second.value); - } else if (child->mChildMask.isOff() && child->mValueMask.isOff()) { - it->second.value = child->getFirstValue(); - it->second.state = false; - it->second.child = nullptr; - delete child; - prune = true; - } - } - if (rebuild && prune) mgr.init(); -}// levelSetToFog - -// ----------------------------> Implementations of random access methods <-------------------------------------- - -template -struct TouchLeaf { - static BuildLeaf& set(BuildLeaf &leaf, uint32_t) {return leaf;} -};// TouchLeaf - -/// @brief Implements Tree::getValue(Coord), i.e. return the value associated with a specific coordinate @c ijk. -/// @tparam BuildT Build type of the grid being called -/// @details The value at a coordinate maps to the background, a tile value or a leaf value. -template -struct GetValue { - static auto get(const BuildRoot &root) {return root.mBackground;} - static auto get(const BuildTile &tile) {return tile.value;} - static auto get(const BuildUpper &node, uint32_t n) {return node.mTable[n].value;} - static auto get(const BuildLower &node, uint32_t n) {return node.mTable[n].value;} - static auto get(const BuildLeaf &leaf, uint32_t n) {return leaf.getValue(n);} -};// GetValue - -/// @brief Implements Tree::isActive(Coord) -/// @tparam T Build type of the grid being called -template -struct GetState { - static bool get(const BuildRoot&) {return false;} - static bool get(const BuildTile &tile) {return tile.state;} - static bool get(const BuildUpper &node, uint32_t n) {return node.mValueMask.isOn(n);} - static bool get(const BuildLower &node, uint32_t n) {return node.mValueMask.isOn(n);} - static bool get(const BuildLeaf &leaf, uint32_t n) {return leaf.mValueMask.isOn(n);} -};// GetState - -/// @brief Set the value and its state at the leaf level mapped to by ijk, and create the leaf node and branch if needed. 
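The structs in this section are the functors that the get/set hooks shown earlier dispatch to: each tree level resolves the coordinate to a linear offset and forwards it, so one struct implements a complete random-access operation across all levels. A hypothetical read-functor in the same style (the name GetAbs and its use of std::fabs are invented; the BuildRoot/BuildTile/BuildUpper/BuildLower/BuildLeaf aliases are the ones defined later in this header):

#include <nanovdb/util/GridBuilder.h>
#include <cmath>
#include <cstdint>

// Returns the absolute value stored at a coordinate, whichever level
// (root background, tile or voxel) the lookup resolves to.
template <typename BuildT>
struct GetAbs
{
    static auto get(const nanovdb::build::BuildRoot<BuildT>& root) { return std::fabs(root.mBackground); }
    static auto get(const nanovdb::build::BuildTile<BuildT>& tile) { return std::fabs(tile.value); }
    static auto get(const nanovdb::build::BuildUpper<BuildT>& node, uint32_t n) { return std::fabs(node.mTable[n].value); }
    static auto get(const nanovdb::build::BuildLower<BuildT>& node, uint32_t n) { return std::fabs(node.mTable[n].value); }
    static auto get(const nanovdb::build::BuildLeaf<BuildT>& leaf, uint32_t n) { return std::fabs(leaf.getValue(n)); }
};
// Typical use, mirroring GetValue: acc.get<GetAbs<float>>(ijk);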
-/// @tparam T BuildType of the corresponding tree -template -struct SetValue { - static BuildLeaf* set(BuildLeaf &leaf, uint32_t n) { - leaf.mValueMask.setOn(n);// always set the active bit - return &leaf; - } - static BuildLeaf* set(BuildLeaf &leaf, uint32_t n, const typename BuildLeaf::ValueType &v) { - leaf.setValue(n, v); - return &leaf; - } -};// SetValue - -/// @brief Implements Tree::probeLeaf(Coord) -/// @tparam T Build type of the grid being called -template -struct ProbeValue { - using ValueT = typename BuildLeaf::ValueType; - static bool get(const BuildRoot &root, ValueT &v) { - v = root.mBackground; - return false; - } - static bool get(const BuildTile &tile, ValueT &v) { - v = tile.value; - return tile.state; - } - static bool get(const BuildUpper &node, uint32_t n, ValueT &v) { - v = node.mTable[n].value; - return node.mValueMask.isOn(n); - } - static bool get(const BuildLower &node, uint32_t n, ValueT &v) { - v = node.mTable[n].value; - return node.mValueMask.isOn(n); - } - static bool get(const BuildLeaf &leaf, uint32_t n, ValueT &v) { - v = leaf.getValue(n); - return leaf.isActive(n); - } -};// ProbeValue - -} // namespace build - -} // namespace nanovdb - -#endif // NANOVDB_GRID_BUILDER_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridBuilder.h instead.") diff --git a/nanovdb/nanovdb/util/GridChecksum.h b/nanovdb/nanovdb/util/GridChecksum.h index 531a6f674b..1b0075f9c0 100644 --- a/nanovdb/nanovdb/util/GridChecksum.h +++ b/nanovdb/nanovdb/util/GridChecksum.h @@ -1,462 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file GridChecksum.h - - \author Ken Museth - - \brief Computes a pair of 32bit checksums, of a Grid, by means of Cyclic Redundancy Check (CRC) - - \details A CRC32 is the 32 bit remainder, or residue, of binary division of a message, by a polynomial. -*/ - -#ifndef NANOVDB_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED -#define NANOVDB_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED - -#include // for std::generate -#include -#include -#include -#include // offsetof macro -#include -#include -#include // for std::unique_ptr - -#include -#include "ForEach.h" -#include "NodeManager.h" - -// Define log of block size for FULL CRC32 computation. -// A value of 12 corresponds to a block size of 4KB (2^12 = 4096). -// Undefine to use old checksum computation -#define NANOVDB_CRC32_LOG2_BLOCK_SIZE 12 - -namespace nanovdb { - -/// @brief List of different modes for computing for a checksum -enum class ChecksumMode : uint32_t { Disable = 0,// no computation - Partial = 1,// fast but approximate - Full = 2,// slow but accurate - Default = 1,// defaults to Partial - End = 3 };// marks the end of the enum list - -/// @brief Return the (2 x CRC32) checksum of the specified @a grid -/// @tparam BuildT Template parameter used to build NanoVDB grid. -/// @param grid Grid from which the checksum is computed. -/// @param mode Defines the mode of computation for the checksum. -/// @return Return the (2 x CRC32) checksum of the specified @a grid -template -uint64_t checksum(const NanoGrid &grid, ChecksumMode mode = ChecksumMode::Default); - -/// @brief Return true if the checksum of the @a grid matches the expected -/// value already encoded into the grid's meta data. -/// @tparam BuildT Template parameter used to build NanoVDB grid. -/// @param grid Grid whose checksum is validated. -/// @param mode Defines the mode of computation for the checksum. 
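Taken together, checksum, validateChecksum and updateChecksum cover the typical round trip: recompute, store and verify. A minimal sketch, assuming grid refers to a valid NanoGrid<float> built elsewhere:

#include <nanovdb/util/GridChecksum.h>
#include <cassert>
#include <cstdint>

void refreshAndVerify(nanovdb::NanoGrid<float>& grid)
{
    // Full mode hashes the header and every tree node; Partial hashes only
    // the Grid/Tree/Root header portion.
    nanovdb::updateChecksum(grid, nanovdb::ChecksumMode::Full);
    assert(nanovdb::validateChecksum(grid, nanovdb::ChecksumMode::Full));
    // checksum() alone returns the 2 x 32bit value without modifying the grid.
    const uint64_t cs = nanovdb::checksum(grid, nanovdb::ChecksumMode::Full);
    (void)cs;
}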
-template <typename BuildT>
-bool validateChecksum(const NanoGrid<BuildT> &grid, ChecksumMode mode = ChecksumMode::Default);
-
-/// @brief Updates the checksum of a grid
-///
-/// @param grid Grid whose checksum will be updated.
-/// @param mode Defines the mode of computation for the checksum.
-template <typename BuildT>
-void updateChecksum(NanoGrid<BuildT> &grid, ChecksumMode mode = ChecksumMode::Default);
-
-namespace crc32 {
-
-/// @brief Initiate single entry in look-up-table for CRC32 computations
-/// @param lut pointer of size 256 for look-up-table
-/// @param n entry in table (assumed n < 256)
-inline __hostdev__ void initLut(uint32_t lut[256], uint32_t n)
-{
-    uint32_t &cs = lut[n] = n;
-    for (int i = 0; i < 8; ++i) cs = (cs >> 1) ^ ((cs & 1) ? 0xEDB88320 : 0);
-}
-
-/// @brief Initiate entire look-up-table for CRC32 computations
-/// @param lut pointer of size 256 for look-up-table
-inline __hostdev__ void initLut(uint32_t lut[256]){for (uint32_t n = 0u; n < 256u; ++n) initLut(lut, n);}
-
-/// @brief Create and initiate entire look-up-table for CRC32 computations
-/// @return returns a unique pointer to the lookup table of size 256.
-inline std::unique_ptr<uint32_t[]> createLut()
-{
-    std::unique_ptr<uint32_t[]> lut(new uint32_t[256]);
-    initLut(lut.get());
-    return lut;
-}
-
-/// @brief Compute crc32 checksum of @c data of @c size bytes (without a lookup table)
-/// @param data pointer to beginning of data
-/// @param size byte size of data
-/// @param crc initial value of crc32 checksum
-/// @return return crc32 checksum of @c data
-inline __hostdev__ uint32_t checksum(const void* data, size_t size, uint32_t crc = 0)
-{
-    crc = ~crc;
-    for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) {
-        crc ^= *p;
-        for (int j = 0; j < 8; ++j) crc = (crc >> 1) ^ (0xEDB88320 & (-(crc & 1)));
-    }
-    return ~crc;
-}
-
-/// @brief Compute crc32 checksum of data between @c begin and @c end
-/// @param begin points to beginning of data
-/// @param end points to end of @c data (exclusive)
-/// @param crc initial value of crc32 checksum
-/// @return return crc32 checksum
-inline __hostdev__ uint32_t checksum(const void *begin, const void *end, uint32_t crc = 0)
-{
-    NANOVDB_ASSERT(begin && end);
-    NANOVDB_ASSERT(end >= begin);
-    return checksum(begin, (const char*)end - (const char*)begin, crc);
-}
-
-/// @brief Compute crc32 checksum of @c data with @c size bytes using a lookup table
-/// @param data pointer to beginning of data
-/// @param size byte size
-/// @param lut pointer to lookup table for accelerated crc32 computation
-/// @param crc initial value of the checksum
-/// @return crc32 checksum of @c data with @c size bytes
-inline __hostdev__ uint32_t checksum(const void *data, size_t size, const uint32_t lut[256], uint32_t crc = 0)
-{
-    crc = ~crc;
-    for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) crc = lut[(crc ^ *p) & 0xFF] ^ (crc >> 8);
-    return ~crc;
-}
-
-/// @brief Compute crc32 checksum of data between @c begin and @c end using a lookup table
-/// @param begin points to beginning of data
-/// @param end points to end of @c data (exclusive)
-/// @param lut pointer to lookup table for accelerated crc32 computation
-/// @param crc initial value of crc32 checksum
-/// @return return crc32 checksum
-inline __hostdev__ uint32_t checksum(const void *begin, const void *end, const uint32_t lut[256], uint32_t crc = 0)
-{
-    NANOVDB_ASSERT(begin && end);
-    NANOVDB_ASSERT(end >= begin);
-    return checksum(begin, (const char*)end - (const char*)begin, lut, crc);
-}
-
-}// namespace crc32
-
-/// @brief Class that encapsulates two CRC32 checksums, one for the Grid, Tree and Root node meta data
-/// and one for the remaining grid nodes.
-class GridChecksum
-{
-    /// Three types of checksums:
-    /// 1) Empty: all 64 bits are on (used to signify no checksum)
-    /// 2) Partial: Upper 32 bits are on and not all of lower 32 bits are on (lower 32 bits checksum head of grid)
-    /// 3) Full: Not all of the 64 bits are one (lower 32 bits checksum head of grid and upper 32 bits checksum tail of grid)
-    union {uint32_t mCRC[2]; uint64_t mChecksum; };// mCRC[0] is checksum of Grid, Tree and Root, and mCRC[1] is checksum of nodes
-    static constexpr uint32_t EMPTY32 = ~uint32_t{0};
-
-public:
-
-    static constexpr uint64_t EMPTY = ~uint64_t(0);
-
-    /// @brief default constructor initiates checksum to EMPTY
-    GridChecksum() : mCRC{EMPTY32, EMPTY32} {}
-
-    /// @brief Constructor that allows the two 32bit checksums to be initiated explicitly
-    /// @param head Initial 32bit CRC checksum of grid, tree and root data
-    /// @param tail Initial 32bit CRC checksum of all the nodes and blind data
-    GridChecksum(uint32_t head, uint32_t tail) : mCRC{head, tail} {}
-
-    /// @brief Constructor that initiates the checksum from a 64bit value and a mode
-    /// @param checksum Initial 64bit checksum value
-    /// @param mode Mode of the checksum; Partial discards the node (tail) checksum
-    GridChecksum(uint64_t checksum, ChecksumMode mode = ChecksumMode::Full) : mChecksum{mode == ChecksumMode::Disable ? EMPTY : checksum}
-    {
-        if (mode == ChecksumMode::Partial) mCRC[1] = EMPTY32;
-    }
-
-    /// @brief return the 64 bit checksum of this instance
-    uint64_t checksum() const { return mChecksum; }
-
-    /// @brief return 32 bit (crc32) checksum of this instance
-    /// @param i index of value 0 or 1 indicating the 32 bit checksum of the head or nodes
-    /// @return non-const reference of the i'th 32bit checksum
-    uint32_t& checksum(int i) {NANOVDB_ASSERT(i==0 || i==1); return mCRC[i]; }
-
-    /// @brief return 32 bit (crc32) checksum of this instance
-    /// @param i index of value 0 or 1 indicating the 32 bit checksum of the head or nodes
-    /// @return copy of the i'th 32bit checksum
-    uint32_t checksum(int i) const {NANOVDB_ASSERT(i==0 || i==1); return mCRC[i]; }
-
-    /// @brief return true if the 64 bit checksum is partial, i.e. of head only
-    bool isPartial() const { return mCRC[0] != EMPTY32 && mCRC[1] == EMPTY32; }
-
-    /// @brief return true if the 64 bit checksum is full, i.e. of both head and nodes
-    bool isFull() const { return mCRC[0] != EMPTY32 && mCRC[1] != EMPTY32; }
-
-    /// @brief return true if the 64 bit checksum is disabled (unset)
-    bool isEmpty() const { return mChecksum == EMPTY; }
-
-    /// @brief return the mode of the 64 bit checksum
-    ChecksumMode mode() const
-    {
-        return mChecksum == EMPTY ? ChecksumMode::Disable :
-               mCRC[1] == EMPTY32 ? 
ChecksumMode::Partial : ChecksumMode::Full; - } -#ifdef NANOVDB_CRC32_LOG2_BLOCK_SIZE - /// @brief compute checksum of @c gridData using a 4KB blocked approach - /// @param gridData Reference to GridData - /// @param mode Mode of the checksum computation - ChecksumMode operator()(const GridData &gridData, ChecksumMode mode = ChecksumMode::Full); -#else - /// @brief Compute checksum using old (node-based) approach - /// @tparam ValueT Build type of the grid - /// @param grid Reference to Grid - /// @param mode Mode of the checksum computation - template - void operator()(const NanoGrid &grid, ChecksumMode mode = ChecksumMode::Full); -#endif - /// @brief return true if the checksums are identical - /// @param rhs other GridChecksum - bool operator==(const GridChecksum &rhs) const {return mChecksum == rhs.mChecksum;} - - /// @brief return true if the checksums are not identical - /// @param rhs other GridChecksum - bool operator!=(const GridChecksum &rhs) const {return mChecksum != rhs.mChecksum;} -};// GridChecksum - -// [GridData][TreeData]---[RootData][ROOT TILES...]---[NodeData<5>]---[NodeData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. - -#ifdef NANOVDB_CRC32_LOG2_BLOCK_SIZE - -inline ChecksumMode GridChecksum::operator()(const GridData &gridData, ChecksumMode mode) -{ - mChecksum = EMPTY; - - if (mode == ChecksumMode::Disable) return ChecksumMode::Disable; - - auto lut = crc32::createLut(); - const uint8_t *begin = (const uint8_t*)(&gridData), *mid = gridData.template nodePtr<2>(), *end = begin + gridData.mGridSize;// what about empty grids? - if (mid == nullptr) {// no (upper) nodes - if (gridData.mBlindMetadataCount) { - mid = begin + gridData.mBlindMetadataOffset;// exclude blind data from Partial checksum - } else { - mid = end;// no nodes or blind data, so Partial checksum is computed on the entire grid buffer - } - } - mCRC[0] = crc32::checksum(begin + 16, mid, lut.get());// GridData, TreeData. 
RootData but exclude GridData::mMagic and GridData::mChecksum - - if (mode != ChecksumMode::Full || mid == end) return ChecksumMode::Partial; - - uint64_t size = end - mid;// includes blind data - const uint64_t blockCount = size >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// number of 4 KB (4096 byte) blocks - std::unique_ptr checksums(new uint32_t[blockCount]); - forEach(0, blockCount, 64, [&](const Range1D &r) { - uint32_t blockSize = 1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE; - uint32_t *p = checksums.get() + r.begin(); - for (auto i = r.begin(); i != r.end(); ++i) { - if (i+1 == blockCount) blockSize += size - (blockCount< -void GridChecksum::operator()(const NanoGrid &grid, ChecksumMode mode) -{ - // Validate the assumed memory layout - static_assert(offsetof(GridData, mMagic) == 0, "Unexpected offset to magic number"); - static_assert(offsetof(GridData, mChecksum) == 8, "Unexpected offset to checksum"); - static_assert(offsetof(GridData, mVersion) == 16, "Unexpected offset to version number"); - - mChecksum = EMPTY; - - if (mode == ChecksumMode::Disable) return; - - auto lut = crc32::createLut(); - const uint8_t *begin = reinterpret_cast(&grid), *mid = grid.template nodePtr<2>(); - - mCRC[0] = crc32::checksum(begin + 16, mid, lut.get());// process Grid + Tree + Root but exclude mMagic and mChecksum - - if (mode != ChecksumMode::Full || grid.isEmpty()) return; - - const auto &tree = grid.tree(); - const auto &root = tree.root(); - auto nodeMgrHandle = createNodeManager(grid); - auto *nodeMgr = nodeMgrHandle.template mgr(); - assert(isValid(nodeMgr)); - const auto nodeCount = tree.nodeCount(0) + tree.nodeCount(1) + tree.nodeCount(2); - std::vector checksums(nodeCount, 0); - // process upper internal nodes - auto kernel2 = [&](const Range1D &r) { - uint32_t *p = checksums.data() + r.begin(); - for (auto i = r.begin(); i != r.end(); ++i) { - const auto &node = nodeMgr->upper(static_cast(i)); - *p++ = crc32::checksum(&node, node.memUsage(), lut.get()); - } - }; - // process lower internal nodes - auto kernel1 = [&](const Range1D &r) { - uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(2); - for (auto i = r.begin(); i != r.end(); ++i) { - const auto &node = nodeMgr->lower(static_cast(i)); - *p++ = crc32::checksum(&node, node.memUsage(), lut.get()); - } - }; - // process leaf nodes - auto kernel0 = [&](const Range1D &r) { - uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(1) + tree.nodeCount(2); - for (auto i = r.begin(); i != r.end(); ++i) { - const auto &leaf = nodeMgr->leaf(static_cast(i)); - *p++ = crc32::checksum(&leaf, leaf.memUsage(), lut.get()); - } - }; - forEach(0, tree.nodeCount(2), 1, kernel2); - forEach(0, tree.nodeCount(1), 1, kernel1); - forEach(0, tree.nodeCount(0), 8, kernel0); - mCRC[1] = crc32::checksum(checksums.data(), sizeof(uint32_t)*checksums.size(), lut.get()); -}// GridChecksum::operator() - -#endif// NANOVDB_CRC32_LOG2_BLOCK_SIZE - -template -uint64_t checksum(const NanoGrid &grid, ChecksumMode mode) -{ - GridChecksum cs; - cs(grid, mode); - return cs.checksum(); -} - -template -bool validateChecksum(const NanoGrid &grid, ChecksumMode mode) -{ - GridChecksum cs1(grid.checksum(), mode), cs2; - cs2(grid, cs1.mode() ); - return cs1 == cs2; -} - -template -void updateChecksum(NanoGrid &grid, ChecksumMode mode) -{ - GridChecksum cs; - cs(grid, mode); - grid.data()->mChecksum = cs.checksum(); -} - -inline bool updateChecksum(GridData &gridData, ChecksumMode mode) -{ -#ifdef NANOVDB_CRC32_LOG2_BLOCK_SIZE - GridChecksum cs; - cs(gridData, mode); - gridData.mChecksum 
= cs.checksum();
-#else
-    if (mode == ChecksumMode::Disable) return false;
-    switch (gridData.mGridType){
-    case GridType::Float:
-        updateChecksum(*reinterpret_cast<NanoGrid<float>*>(&gridData), mode);
-        break;
-    case GridType::Double:
-        updateChecksum(*reinterpret_cast<NanoGrid<double>*>(&gridData), mode);
-        break;
-    case GridType::Int16:
-        updateChecksum(*reinterpret_cast<NanoGrid<int16_t>*>(&gridData), mode);
-        break;
-    case GridType::Int32:
-        updateChecksum(*reinterpret_cast<NanoGrid<int32_t>*>(&gridData), mode);
-        break;
-    case GridType::Int64:
-        updateChecksum(*reinterpret_cast<NanoGrid<int64_t>*>(&gridData), mode);
-        break;
-    case GridType::Vec3f:
-        updateChecksum(*reinterpret_cast<NanoGrid<Vec3f>*>(&gridData), mode);
-        break;
-    case GridType::Vec3d:
-        updateChecksum(*reinterpret_cast<NanoGrid<Vec3d>*>(&gridData), mode);
-        break;
-    case GridType::UInt32:
-        updateChecksum(*reinterpret_cast<NanoGrid<uint32_t>*>(&gridData), mode);
-        break;
-    case GridType::Mask:
-        updateChecksum(*reinterpret_cast<NanoGrid<ValueMask>*>(&gridData), mode);
-        break;
-    case GridType::Index:
-        updateChecksum(*reinterpret_cast<NanoGrid<ValueIndex>*>(&gridData), mode);
-        break;
-    case GridType::OnIndex:
-        updateChecksum(*reinterpret_cast<NanoGrid<ValueOnIndex>*>(&gridData), mode);
-        break;
-    case GridType::IndexMask:
-        updateChecksum(*reinterpret_cast<NanoGrid<ValueIndexMask>*>(&gridData), mode);
-        break;
-    case GridType::OnIndexMask:
-        updateChecksum(*reinterpret_cast<NanoGrid<ValueOnIndexMask>*>(&gridData), mode);
-        break;
-    case GridType::Boolean:
-        updateChecksum(*reinterpret_cast<NanoGrid<bool>*>(&gridData), mode);
-        break;
-    case GridType::RGBA8:
-        updateChecksum(*reinterpret_cast<NanoGrid<Rgba8>*>(&gridData), mode);
-        break;
-    case GridType::Fp4:
-        updateChecksum(*reinterpret_cast<NanoGrid<Fp4>*>(&gridData), mode);
-        break;
-    case GridType::Fp8:
-        updateChecksum(*reinterpret_cast<NanoGrid<Fp8>*>(&gridData), mode);
-        break;
-    case GridType::Fp16:
-        updateChecksum(*reinterpret_cast<NanoGrid<Fp16>*>(&gridData), mode);
-        break;
-    case GridType::FpN:
-        updateChecksum(*reinterpret_cast<NanoGrid<FpN>*>(&gridData), mode);
-        break;
-    case GridType::Vec4f:
-        updateChecksum(*reinterpret_cast<NanoGrid<Vec4f>*>(&gridData), mode);
-        break;
-    case GridType::Vec4d:
-        updateChecksum(*reinterpret_cast<NanoGrid<Vec4d>*>(&gridData), mode);
-        break;
-    default: {
-        std::stringstream ss;
-        ss << "Cannot update checksum for grid of unknown type \"" << toStr(gridData.mGridType);
-        throw std::runtime_error(ss.str() + "\"");
-    }
-    }// switch
-#endif
-    return true;
-}// updateChecksum(GridData &gridData, ChecksumMode mode)
-
-/// @brief Preserve the existing mode of the checksum and update it if it's not disabled
-/// @param data Pointer to the grid data whose checksum is updated in-place
-/// @return true if the checksum was updated
-inline bool updateChecksum(GridData *data)
-{
-    GridChecksum cs(data->mChecksum);
-    const auto mode = cs.mode();
-    return updateChecksum(*data, mode);
-}// updateChecksum(GridData *data)
-
-/// @brief Updates the grid index and count, as well as the partial checksum if needed
-/// @param data Pointer to grid data
-/// @param gridIndex New value of the index
-/// @param gridCount New value of the grid count
-/// @return returns true if the checksum was updated
-inline bool updateGridCount(GridData *data, uint32_t gridIndex, uint32_t gridCount)
-{
-    NANOVDB_ASSERT(gridIndex < gridCount);
-    if (data->mGridIndex == gridIndex && data->mGridCount == gridCount) return false;// nothing to update
-    data->mGridIndex = gridIndex;
-    data->mGridCount = gridCount;
-    GridChecksum cs(data->mChecksum);
-    if (cs.isEmpty()) return false;// no checksum to update
-    updateChecksum(*data, ChecksumMode::Partial);// only update the checksum of the grid since we only modified the GridData
-    reinterpret_cast<GridChecksum*>(&(data->mChecksum))->checksum(1) = cs.checksum(1);// copy the old checksum of the tree nodes since it was set to EMPTY during the update
-    return true;
-}
-
-} // namespace nanovdb
-
-#endif // NANOVDB_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/GridChecksum.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridChecksum.h instead.") diff --git a/nanovdb/nanovdb/util/GridStats.h b/nanovdb/nanovdb/util/GridStats.h index 267e7462e3..e84b14229b 100644 --- a/nanovdb/nanovdb/util/GridStats.h +++ b/nanovdb/nanovdb/util/GridStats.h @@ -1,855 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file GridStats.h - - \author Ken Museth - - \date August 29, 2020 - - \brief Re-computes min/max/avg/var/bbox information for each node in a - pre-existing NanoVDB grid. -*/ - -#ifndef NANOVDB_GRIDSTATS_H_HAS_BEEN_INCLUDED -#define NANOVDB_GRIDSTATS_H_HAS_BEEN_INCLUDED - -#include <nanovdb/NanoVDB.h> -#include "Range.h" -#include "ForEach.h" - -#ifdef NANOVDB_USE_TBB -#include <tbb/parallel_reduce.h> -#endif - -#if defined(__CUDACC__) -#include <cuda/std/limits> // for cuda::std::numeric_limits -#else -#include <limits> // for std::numeric_limits -#endif - -#include -#include - -namespace nanovdb { - -/// @brief Mode that determines which statistics (bbox, extrema, average, standard deviation) are computed and stored in the grid buffer -enum class StatsMode : uint32_t { - Disable = 0,// disable the computation of any type of statistics (obviously the FASTEST!) - BBox = 1,// only compute the bbox of active values per node and total activeVoxelCount - MinMax = 2,// additionally compute extrema values - All = 3,// compute all of the statistics, i.e. bbox, min/max, average and standard deviation - Default = 3,// default computational mode for statistics - End = 4, -}; - -/// @brief Re-computes the min/max, stats and bbox information for an existing NanoVDB Grid -/// -/// @param grid Grid whose stats to update -/// @param mode Mode of computation for the statistics. -template<typename BuildT> -void gridStats(NanoGrid<BuildT>& grid, StatsMode mode = StatsMode::Default); - -//================================================================================================ - -template<typename ValueT, int Rank = TensorTraits<ValueT>::Rank> -class Extrema; - -/// @brief Template specialization of Extrema on scalar value types, i.e.
rank = 0 -template -class Extrema -{ -protected: - ValueT mMin, mMax; - -public: - using ValueType = ValueT; - __hostdev__ Extrema() -#if defined(__CUDACC__) - : mMin(cuda::std::numeric_limits::max()) - , mMax(cuda::std::numeric_limits::lowest()) -#else - : mMin(std::numeric_limits::max()) - , mMax(std::numeric_limits::lowest()) -#endif - { - } - __hostdev__ Extrema(const ValueT& v) - : mMin(v) - , mMax(v) - { - } - __hostdev__ Extrema(const ValueT& a, const ValueT& b) - : mMin(a) - , mMax(b) - { - } - __hostdev__ Extrema& min(const ValueT& v) - { - if (v < mMin) mMin = v; - return *this; - } - __hostdev__ Extrema& max(const ValueT& v) - { - if (v > mMax) mMax = v; - return *this; - } - __hostdev__ Extrema& add(const ValueT& v) - { - this->min(v); - this->max(v); - return *this; - } - __hostdev__ Extrema& add(const ValueT& v, uint64_t) { return this->add(v); } - __hostdev__ Extrema& add(const Extrema& other) - { - this->min(other.mMin); - this->max(other.mMax); - return *this; - } - __hostdev__ const ValueT& min() const { return mMin; } - __hostdev__ const ValueT& max() const { return mMax; } - __hostdev__ operator bool() const { return mMin <= mMax; } - __hostdev__ static constexpr bool hasMinMax() { return !std::is_same::value; } - __hostdev__ static constexpr bool hasAverage() { return false; } - __hostdev__ static constexpr bool hasStdDeviation() { return false; } - __hostdev__ static constexpr bool hasStats() { return !std::is_same::value; } - __hostdev__ static constexpr size_t size() { return 0; } - - template - __hostdev__ void setStats(NodeT &node) const - { - node.setMin(this->min()); - node.setMax(this->max()); - } -}; // Extrema - -/// @brief Template specialization of Extrema on vector value types, i.e. rank = 1 -template -class Extrema -{ -protected: - using Real = typename VecT::ValueType; // this works with both nanovdb and openvdb vectors - struct Pair - { - Real scalar; - VecT vector; - - __hostdev__ Pair(Real s)// is only used by Extrema() default c-tor - : scalar(s) - , vector(s) - { - } - __hostdev__ Pair(const VecT& v) - : scalar(v.lengthSqr()) - , vector(v) - { - } - __hostdev__ bool operator<(const Pair& rhs) const { return scalar < rhs.scalar; } - } mMin, mMax; - __hostdev__ Extrema& add(const Pair& p) - { - if (p < mMin) mMin = p; - if (mMax < p) mMax = p; - return *this; - } - -public: - using ValueType = VecT; - __hostdev__ Extrema() -#if defined(__CUDACC__) - : mMin(cuda::std::numeric_limits::max()) - , mMax(cuda::std::numeric_limits::lowest()) -#else - : mMin(std::numeric_limits::max()) - , mMax(std::numeric_limits::lowest()) -#endif - { - } - __hostdev__ Extrema(const VecT& v) - : mMin(v) - , mMax(v) - { - } - __hostdev__ Extrema(const VecT& a, const VecT& b) - : mMin(a) - , mMax(b) - { - } - __hostdev__ Extrema& min(const VecT& v) - { - Pair tmp(v); - if (tmp < mMin) mMin = tmp; - return *this; - } - __hostdev__ Extrema& max(const VecT& v) - { - Pair tmp(v); - if (mMax < tmp) mMax = tmp; - return *this; - } - __hostdev__ Extrema& add(const VecT& v) { return this->add(Pair(v)); } - __hostdev__ Extrema& add(const VecT& v, uint64_t) { return this->add(Pair(v)); } - __hostdev__ Extrema& add(const Extrema& other) - { - if (other.mMin < mMin) mMin = other.mMin; - if (mMax < other.mMax) mMax = other.mMax; - return *this; - } - __hostdev__ const VecT& min() const { return mMin.vector; } - __hostdev__ const VecT& max() const { return mMax.vector; } - __hostdev__ operator bool() const { return !(mMax < mMin); } - __hostdev__ static constexpr bool hasMinMax() { return 
!std::is_same<bool, VecT>::value; } - __hostdev__ static constexpr bool hasAverage() { return false; } - __hostdev__ static constexpr bool hasStdDeviation() { return false; } - __hostdev__ static constexpr bool hasStats() { return !std::is_same<bool, VecT>::value; } - __hostdev__ static constexpr size_t size() { return 0; } - - template<typename NodeT> - __hostdev__ void setStats(NodeT &node) const - { - node.setMin(this->min()); - node.setMax(this->max()); - } -}; // Extrema<VecT, 1> - -//================================================================================================ - -template<typename ValueT, int Rank = TensorTraits<ValueT>::Rank> -class Stats; - -/// @brief This class computes statistics (minimum value, maximum -/// value, mean, variance and standard deviation) of a population -/// of floating-point values. -/// -/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2, -/// standard deviation = sqrt(variance) -/// -/// @note This class employs incremental computation and double precision. -template<typename ValueT> -class Stats<ValueT, 0> : public Extrema<ValueT, 0> -{ -protected: - using BaseT = Extrema<ValueT, 0>; - using RealT = double; // for accuracy the internal precision must be 64 bit floats - size_t mSize; - double mAvg, mAux; - -public: - using ValueType = ValueT; - __hostdev__ Stats() - : BaseT() - , mSize(0) - , mAvg(0.0) - , mAux(0.0) - { - } - __hostdev__ Stats(const ValueT& val) - : BaseT(val) - , mSize(1) - , mAvg(RealT(val)) - , mAux(0.0) - { - } - /// @brief Add a single sample - __hostdev__ Stats& add(const ValueT& val) - { - BaseT::add(val); - mSize += 1; - const double delta = double(val) - mAvg; - mAvg += delta / double(mSize); - mAux += delta * (double(val) - mAvg); - return *this; - } - /// @brief Add @a n samples with constant value @a val. - __hostdev__ Stats& add(const ValueT& val, uint64_t n) - { - const double denom = 1.0 / double(mSize + n); - const double delta = double(val) - mAvg; - mAvg += denom * delta * double(n); - mAux += denom * delta * delta * double(mSize) * double(n); - BaseT::add(val); - mSize += n; - return *this; - } - - /// Add the samples from the other Stats instance. - __hostdev__ Stats& add(const Stats& other) - { - if (other.mSize > 0) { - const double denom = 1.0 / double(mSize + other.mSize); - const double delta = other.mAvg - mAvg; - mAvg += denom * delta * double(other.mSize); - mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize); - BaseT::add(other); - mSize += other.mSize; - } - return *this; - } - - __hostdev__ static constexpr bool hasMinMax() { return !std::is_same<bool, ValueT>::value; } - __hostdev__ static constexpr bool hasAverage() { return !std::is_same<bool, ValueT>::value; } - __hostdev__ static constexpr bool hasStdDeviation() { return !std::is_same<bool, ValueT>::value; } - __hostdev__ static constexpr bool hasStats() { return !std::is_same<bool, ValueT>::value; } - - __hostdev__ size_t size() const { return mSize; } - - //@{ - /// Return the arithmetic mean, i.e. average, value. - __hostdev__ double avg() const { return mAvg; } - __hostdev__ double mean() const { return mAvg; } - //@} - - //@{ - /// @brief Return the population variance. - /// - /// @note The unbiased sample variance = population variance * num/(num-1) - __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); } - __hostdev__ double variance() const { return this->var(); } - //@} - - //@{ - /// @brief Return the standard deviation (=Sqrt(variance)) as - /// defined from the (biased) population variance.
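For reference, the add() methods above implement the classic Welford recurrence: the running mean absorbs each sample as delta/n, while mAux accumulates the sum of squared deviations so that the biased population variance is simply mAux/n. A minimal, self-contained sketch of the same recurrence (hypothetical code, not part of the deleted header):

@code
#include <cassert>
#include <cmath>
#include <cstddef>

// Welford-style running statistics, mirroring the recurrence in Stats::add() above.
struct RunningStats
{
    size_t n = 0;
    double mean = 0.0, aux = 0.0; // aux = sum of squared deviations from the mean
    void add(double x)
    {
        ++n;
        const double delta = x - mean;
        mean += delta / double(n);  // update the running mean first
        aux  += delta * (x - mean); // note: uses the *updated* mean
    }
    double var() const { return n < 2 ? 0.0 : aux / double(n); } // biased/population variance
    double stdDev() const { return std::sqrt(this->var()); }
};

int main()
{
    RunningStats s;
    for (double x : {1.0, 2.0, 3.0, 4.0}) s.add(x);
    assert(s.mean == 2.5 && s.var() == 1.25); // exact for these samples
}
@endcode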
- __hostdev__ double std() const { return sqrt(this->var()); } - __hostdev__ double stdDev() const { return this->std(); } - //@} - - template - __hostdev__ void setStats(NodeT &node) const - { - node.setMin(this->min()); - node.setMax(this->max()); - node.setAvg(this->avg()); - node.setDev(this->std()); - } -}; // end Stats - -/// @brief This class computes statistics (minimum value, maximum -/// value, mean, variance and standard deviation) of a population -/// of floating-point values. -/// -/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2, -/// standard deviation = sqrt(variance) -/// -/// @note This class employs incremental computation and double precision. -template -class Stats : public Extrema -{ -protected: - using BaseT = Extrema; - using RealT = double; // for accuracy the internal precision must be 64 bit floats - size_t mSize; - double mAvg, mAux; - -public: - using ValueType = ValueT; - __hostdev__ Stats() - : BaseT() - , mSize(0) - , mAvg(0.0) - , mAux(0.0) - { - } - /// @brief Add a single sample - __hostdev__ Stats& add(const ValueT& val) - { - typename BaseT::Pair tmp(val); - BaseT::add(tmp); - mSize += 1; - const double delta = tmp.scalar - mAvg; - mAvg += delta / double(mSize); - mAux += delta * (tmp.scalar - mAvg); - return *this; - } - /// @brief Add @a n samples with constant value @a val. - __hostdev__ Stats& add(const ValueT& val, uint64_t n) - { - typename BaseT::Pair tmp(val); - const double denom = 1.0 / double(mSize + n); - const double delta = tmp.scalar - mAvg; - mAvg += denom * delta * double(n); - mAux += denom * delta * delta * double(mSize) * double(n); - BaseT::add(tmp); - mSize += n; - return *this; - } - - /// Add the samples from the other Stats instance. - __hostdev__ Stats& add(const Stats& other) - { - if (other.mSize > 0) { - const double denom = 1.0 / double(mSize + other.mSize); - const double delta = other.mAvg - mAvg; - mAvg += denom * delta * double(other.mSize); - mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize); - BaseT::add(other); - mSize += other.mSize; - } - return *this; - } - - __hostdev__ static constexpr bool hasMinMax() { return !std::is_same::value; } - __hostdev__ static constexpr bool hasAverage() { return !std::is_same::value; } - __hostdev__ static constexpr bool hasStdDeviation() { return !std::is_same::value; } - __hostdev__ static constexpr bool hasStats() { return !std::is_same::value; } - - __hostdev__ size_t size() const { return mSize; } - - //@{ - /// Return the arithmetic mean, i.e. average, value. - __hostdev__ double avg() const { return mAvg; } - __hostdev__ double mean() const { return mAvg; } - //@} - - //@{ - /// @brief Return the population variance. - /// - /// @note The unbiased sample variance = population variance * num/(num-1) - __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); } - __hostdev__ double variance() const { return this->var(); } - //@} - - //@{ - /// @brief Return the standard deviation (=Sqrt(variance)) as - /// defined from the (biased) population variance. 
- __hostdev__ double std() const { return sqrt(this->var()); } - __hostdev__ double stdDev() const { return this->std(); } - //@} - - template - __hostdev__ void setStats(NodeT &node) const - { - node.setMin(this->min()); - node.setMax(this->max()); - node.setAvg(this->avg()); - node.setDev(this->std()); - } -}; // end Stats - -/// @brief No-op Stats class -template -struct NoopStats -{ - using ValueType = ValueT; - __hostdev__ NoopStats() {} - __hostdev__ NoopStats(const ValueT&) {} - __hostdev__ NoopStats& add(const ValueT&) { return *this; } - __hostdev__ NoopStats& add(const ValueT&, uint64_t) { return *this; } - __hostdev__ NoopStats& add(const NoopStats&) { return *this; } - __hostdev__ static constexpr size_t size() { return 0; } - __hostdev__ static constexpr bool hasMinMax() { return false; } - __hostdev__ static constexpr bool hasAverage() { return false; } - __hostdev__ static constexpr bool hasStdDeviation() { return false; } - __hostdev__ static constexpr bool hasStats() { return false; } - template - __hostdev__ void setStats(NodeT&) const{} -}; // end NoopStats - -//================================================================================================ - -/// @brief Allows for the construction of NanoVDB grids without any dependency -template> -class GridStats -{ - struct NodeStats; - using TreeT = typename GridT::TreeType; - using ValueT = typename TreeT::ValueType; - using BuildT = typename TreeT::BuildType; - using Node0 = typename TreeT::Node0; // leaf - using Node1 = typename TreeT::Node1; // lower - using Node2 = typename TreeT::Node2; // upper - using RootT = typename TreeT::Node3; // root - static_assert(std::is_same::value, "Mismatching type"); - - ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta - - void process( GridT& );// process grid and all tree nodes - void process( TreeT& );// process Tree, root node and child nodes - void process( RootT& );// process root node and child nodes - NodeStats process( Node0& );// process leaf node - - template - NodeStats process( NodeT& );// process internal node and child nodes - - template - void setStats(DataT*, const Extrema&); - template - void setStats(DataT*, const Stats&); - template - void setStats(DataT*, const NoopStats&) {} - - template - typename std::enable_if::value>::type - setFlag(const T&, const T&, FlagT& flag) const { flag &= ~FlagT(1); } // unset 1st bit to enable rendering - - template - typename std::enable_if::value>::type - setFlag(const T& min, const T& max, FlagT& flag) const; - -public: - GridStats() = default; - - void operator()(GridT& grid, ValueT delta = ValueT(0)); - -}; // GridStats - -template -struct GridStats::NodeStats -{ - StatsT stats; - CoordBBox bbox; - - NodeStats(): stats(), bbox() {}//activeCount(0), bbox() {}; - - NodeStats& add(const NodeStats &other) - { - stats.add( other.stats );// no-op for NoopStats?! 
- bbox[0].minComponent(other.bbox[0]); - bbox[1].maxComponent(other.bbox[1]); - return *this; - } -};// GridStats::NodeStats - -//================================================================================================ - -template -void GridStats::operator()(GridT& grid, ValueT delta) -{ - mDelta = delta; // delta = voxel size for level sets, else 0 - this->process( grid ); -} - -//================================================================================================ - -template -template -inline void GridStats:: - setStats(DataT* data, const Extrema& e) -{ - data->setMin(e.min()); - data->setMax(e.max()); -} - -template -template -inline void GridStats:: - setStats(DataT* data, const Stats& s) -{ - data->setMin(s.min()); - data->setMax(s.max()); - data->setAvg(s.avg()); - data->setDev(s.std()); -} - -//================================================================================================ - -template -template -inline typename std::enable_if::value>::type -GridStats:: - setFlag(const T& min, const T& max, FlagT& flag) const -{ - if (mDelta > 0 && (min > mDelta || max < -mDelta)) {// LS: min > dx || max < -dx - flag |= FlagT(1u);// set 1st bit to disable rendering - } else { - flag &= ~FlagT(1u);// unset 1st bit to enable rendering - } -} - -//================================================================================================ - -template -void GridStats::process( GridT &grid ) -{ - this->process( grid.tree() );// this processes tree, root and all nodes - - // set world space AABB - auto& data = *grid.data(); - const auto& indexBBox = grid.tree().root().bbox(); - if (indexBBox.empty()) { - data.mWorldBBox = BBox(); - data.setBBoxOn(false); - } else { - // Note that below max is offset by one since CoordBBox.max is inclusive - // while bbox.max is exclusive. However, min is inclusive in both - // CoordBBox and BBox. This also guarantees that a grid with a single - // active voxel does not have an empty world bbox! E.g. if a grid with a - // unit index-to-world transformation only contains the active voxel (0,0,0) - // then indexBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0) - // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions - // of index and world bounding boxes inherited from OpenVDB!
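The off-by-one described in the comment above can be checked numerically; the sketch below uses simplified stand-ins for the NanoVDB types and reproduces the single-voxel example, where the inclusive index bbox (0,0,0)->(0,0,0) maps to the exclusive world bbox (0.0,0.0,0.0)->(1.0,1.0,1.0) under a unit transform:

@code
#include <cstdio>

struct IndexBBox { int    min[3], max[3]; }; // max is inclusive (CoordBBox convention)
struct WorldBBox { double min[3], max[3]; }; // max is exclusive (BBox convention)

// Mirrors CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(map) for a uniform scale.
WorldBBox toWorld(const IndexBBox& b, double voxelSize)
{
    WorldBBox w;
    for (int i = 0; i < 3; ++i) {
        w.min[i] = voxelSize * b.min[i];
        w.max[i] = voxelSize * (b.max[i] + 1); // offsetBy(1): inclusive max -> exclusive max
    }
    return w;
}

int main()
{
    const IndexBBox single{{0,0,0}, {0,0,0}}; // one active voxel at the origin
    const WorldBBox w = toWorld(single, 1.0);
    std::printf("world bbox = (%g,%g,%g) -> (%g,%g,%g)\n",
                w.min[0], w.min[1], w.min[2], w.max[0], w.max[1], w.max[2]);
}
@endcode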
- data.mWorldBBox = CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(grid.map()); - data.setBBoxOn(true); - } - - // set bit flags - data.setMinMaxOn(StatsT::hasMinMax()); - data.setAverageOn(StatsT::hasAverage()); - data.setStdDeviationOn(StatsT::hasStdDeviation()); -} // GridStats::process( Grid ) - -//================================================================================================ - -template -inline void GridStats::process( typename GridT::TreeType &tree ) -{ - this->process( tree.root() ); -} - -//================================================================================================ - -template -void GridStats::process(RootT &root) -{ - using ChildT = Node2; - auto &data = *root.data(); - if (data.mTableSize == 0) { // empty root node - data.mMinimum = data.mMaximum = data.mBackground; - data.mAverage = data.mStdDevi = 0; - data.mBBox = CoordBBox(); - } else { - NodeStats total; - for (uint32_t i = 0; i < data.mTableSize; ++i) { - auto* tile = data.tile(i); - if (tile->isChild()) { // process child node - total.add( this->process( *data.getChild(tile) ) ); - } else if (tile->state) { // active tile - const Coord ijk = tile->origin(); - total.bbox[0].minComponent(ijk); - total.bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); - if (StatsT::hasStats()) { // resolved at compile time - total.stats.add(tile->value, ChildT::NUM_VALUES); - } - } - } - this->setStats(&data, total.stats); - if (total.bbox.empty()) { - std::cerr << "\nWarning in GridStats: input tree only contained inactive root tiles!" - << "\nWhile not strictly an error it's rather suspicious!\n"; - } - data.mBBox = total.bbox; - } -} // GridStats::process( RootNode ) - -//================================================================================================ - -template -template -typename GridStats::NodeStats -GridStats::process(NodeT &node) -{ - static_assert(is_same<NodeT, Node1>::value || is_same<NodeT, Node2>::value, "Incorrect node type"); - using ChildT = typename NodeT::ChildNodeType; - - NodeStats total; - auto* data = node.data(); - - // Serial processing of active tiles - if (const auto tileCount = data->mValueMask.countOn()) { - //total.activeCount = tileCount * ChildT::NUM_VALUES; // active tiles - for (auto it = data->mValueMask.beginOn(); it; ++it) { - if (StatsT::hasStats()) { // resolved at compile time - total.stats.add( data->mTable[*it].value, ChildT::NUM_VALUES ); - } - const Coord ijk = node.offsetToGlobalCoord(*it); - total.bbox[0].minComponent(ijk); - total.bbox[1].maxComponent(ijk + Coord(int32_t(ChildT::DIM) - 1)); - } - } - - // Serial or parallel processing of child nodes - if (const size_t childCount = data->mChildMask.countOn()) { -#ifndef NANOVDB_USE_TBB - for (auto it = data->mChildMask.beginOn(); it; ++it) { - total.add( this->process( *data->getChild(*it) ) ); - } -#else - std::unique_ptr<ChildT*[]> childNodes(new ChildT*[childCount]); - ChildT **ptr = childNodes.get(); - for (auto it = data->mChildMask.beginOn(); it; ++it) { - *ptr++ = data->getChild( *it ); - } - using RangeT = tbb::blocked_range<size_t>; - total.add( tbb::parallel_reduce(RangeT(0, childCount), NodeStats(), - [&](const RangeT &r, NodeStats local)->NodeStats { - for(size_t i=r.begin(); i!=r.end(); ++i){ - local.add( this->process( *childNodes[i] ) ); - } - return local;}, - [](NodeStats a, const NodeStats &b)->NodeStats { return a.add( b ); } - )); -#endif - } - - data->mBBox = total.bbox; - if (total.bbox.empty()) { - data->mFlags |= uint32_t(1); // set 1st bit on to disable rendering of node - data->mFlags &=
~uint32_t(2); // set 2nd bit off since node does not contain active values - } else { - data->mFlags |= uint32_t(2); // set 2nd bit on since node contains active values - if (StatsT::hasStats()) { // resolved at compile time - this->setStats(data, total.stats); - this->setFlag(data->mMinimum, data->mMaximum, data->mFlags); - } - } - return total; -} // GridStats::process( InternalNode ) - -//================================================================================================ - -template -typename GridStats::NodeStats -GridStats::process(Node0 &leaf) -{ - NodeStats local; - if (leaf.updateBBox()) {// optionally update active bounding box (updates data->mFlags) - local.bbox[0] = local.bbox[1] = leaf.mBBoxMin; - local.bbox[1] += Coord(leaf.mBBoxDif[0], leaf.mBBoxDif[1], leaf.mBBoxDif[2]); - if (StatsT::hasStats()) {// resolved at compile time - for (auto it = leaf.cbeginValueOn(); it; ++it) local.stats.add(*it); - this->setStats(&leaf, local.stats); - this->setFlag(leaf.getMin(), leaf.getMax(), leaf.mFlags); - } - } - return local; -} // GridStats::process( LeafNode ) - -//================================================================================================ - -template -void gridStats(NanoGrid& grid, StatsMode mode) -{ - using GridT = NanoGrid; - using ValueT = typename GridT::ValueType; - if (mode == StatsMode::Disable) { - return; - } else if (mode == StatsMode::BBox || std::is_same::value) { - GridStats > stats; - stats(grid); - } else if (mode == StatsMode::MinMax) { - GridStats > stats; - stats(grid); - } else if (mode == StatsMode::All) { - GridStats > stats; - stats(grid); - } else { - throw std::runtime_error("gridStats: Unsupported statistics mode."); - } -}// gridStats - -//================================================================================================ - -namespace { - -// returns a bitmask (of size 32^3 or 16^3) that marks all the entries -// in a node table that intersects with the specified bounding box. -template -Mask getBBoxMask(const CoordBBox &bbox, const NodeT* node) -{ - Mask mask;// typically 32^3 or 16^3 bit mask - auto b = CoordBBox::createCube(node->origin(), node->dim()); - assert( bbox.hasOverlap(b) ); - if ( bbox.isInside(b) ) { - mask.setOn();//node is completely inside the bbox so early out - } else { - b.intersect(bbox);// trim bounding box - // transform bounding box from global to local coordinates - b.min() &= NodeT::DIM-1u; - b.min() >>= NodeT::ChildNodeType::TOTAL; - b.max() &= NodeT::DIM-1u; - b.max() >>= NodeT::ChildNodeType::TOTAL; - assert( !b.empty() ); - auto it = b.begin();// iterates over all the child nodes or tiles that intersects bbox - for (const Coord& ijk = *it; it; ++it) { - mask.setOn(ijk[2] + (ijk[1] << NodeT::LOG2DIM) + (ijk[0] << 2*NodeT::LOG2DIM)); - } - } - return mask; -}// getBBoxMask - -}// end of unnamed namespace - -/// @brief return the extrema of all the values in a grid that -/// intersects the specified bounding box. 
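Before the deleted getExtrema() implementation that follows, here is a usage sketch of these two entry points under the pre-restructuring API (the file name, the float build type, and the query box are placeholders, not from the source):

@code
#include <nanovdb/util/GridStats.h> // pre-restructuring path, deprecated by this diff
#include <nanovdb/util/IO.h>

// Sketch only: "data.nvdb" and the float grid type are assumptions.
void statsAndExtrema()
{
    auto handle = nanovdb::io::readGrid("data.nvdb");
    if (auto* grid = handle.grid<float>()) {
        nanovdb::gridStats(*grid, nanovdb::StatsMode::All); // recompute bbox, min/max, avg, stddev
        const nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(63));
        auto e = nanovdb::getExtrema(*grid, bbox); // extrema of all values the box intersects
        // e.min() and e.max() now bound the values inside bbox
    }
}
@endcode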
-template -Extrema::ValueType> -getExtrema(const NanoGrid& grid, const CoordBBox &bbox) -{ - using GridT = NanoGrid; - using ValueT = typename GridT::ValueType; - using TreeT = typename GridTree::type; - using RootT = typename NodeTrait::type;// root node - using Node2 = typename NodeTrait::type;// upper internal node - using Node1 = typename NodeTrait::type;// lower internal node - using Node0 = typename NodeTrait::type;// leaf node - - Extrema extrema; - const RootT &root = grid.tree().root(); - const auto &bbox3 = root.bbox(); - if (bbox.isInside(bbox3)) {// bbox3 is contained inside bbox - extrema.min(root.minimum()); - extrema.max(root.maximum()); - extrema.add(root.background()); - } else if (bbox.hasOverlap(bbox3)) { - const auto *data3 = root.data(); - for (uint32_t i=0; imTableSize; ++i) { - const auto *tile = data3->tile(i); - CoordBBox bbox2 = CoordBBox::createCube(tile->origin(), Node2::dim()); - if (!bbox.hasOverlap(bbox2)) continue; - if (tile->isChild()) { - const Node2 *node2 = data3->getChild(tile); - if (bbox.isInside(bbox2)) { - extrema.min(node2->minimum()); - extrema.max(node2->maximum()); - } else {// partial intersections at level 2 - auto *data2 = node2->data(); - const auto bboxMask2 = getBBoxMask(bbox, node2); - for (auto it2 = bboxMask2.beginOn(); it2; ++it2) { - if (data2->mChildMask.isOn(*it2)) { - const Node1* node1 = data2->getChild(*it2); - CoordBBox bbox1 = CoordBBox::createCube(node1->origin(), Node1::dim()); - if (bbox.isInside(bbox1)) { - extrema.min(node1->minimum()); - extrema.max(node1->maximum()); - } else {// partial intersection at level 1 - auto *data1 = node1->data(); - const auto bboxMask1 = getBBoxMask(bbox, node1); - for (auto it1 = bboxMask1.beginOn(); it1; ++it1) { - if (data1->mChildMask.isOn(*it1)) { - const Node0* node0 = data1->getChild(*it1); - CoordBBox bbox0 = CoordBBox::createCube(node0->origin(), Node0::dim()); - if (bbox.isInside(bbox0)) { - extrema.min(node0->minimum()); - extrema.max(node0->maximum()); - } else {// partial intersection at level 0 - auto *data0 = node0->data(); - const auto bboxMask0 = getBBoxMask(bbox, node0); - for (auto it0 = bboxMask0.beginOn(); it0; ++it0) { - extrema.add(data0->getValue(*it0)); - } - }// end partial intersection at level 0 - } else {// tile at level 1 - extrema.add(data1->mTable[*it1].value); - } - } - }// end of partial intersection at level 1 - } else {// tile at level 2 - extrema.add(data2->mTable[*it2].value); - } - }// loop over tiles and nodes at level 2 - }// end of partial intersection at level 1 - } else {// tile at root level - extrema.add(tile->value); - } - }// loop over root table - } else {// bbox does not overlap the grid - extrema.add(root.background()); - } - return extrema; -}// getExtrema - -} // namespace nanovdb - -#endif // NANOVDB_GRIDSTATS_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridStats.h instead.") diff --git a/nanovdb/nanovdb/util/GridValidator.h b/nanovdb/nanovdb/util/GridValidator.h index fe6815bfb4..476e760d4e 100644 --- a/nanovdb/nanovdb/util/GridValidator.h +++ b/nanovdb/nanovdb/util/GridValidator.h @@ -1,185 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file GridValidator.h - - \author Ken Museth - - \date August 30, 2020 - - \brief Checks the validity of an existing NanoVDB grid. 
-*/ - -#ifndef NANOVDB_GRIDVALIDATOR_H_HAS_BEEN_INCLUDED -#define NANOVDB_GRIDVALIDATOR_H_HAS_BEEN_INCLUDED - -#include <nanovdb/NanoVDB.h> -#include "GridChecksum.h" - -namespace nanovdb { - -/// @brief Return true if the specified grid passes several validation tests. -/// -/// @param grid Grid to validate -/// @param detailed If true the validation test is detailed and relatively slow. -/// @param verbose If true information about the first failed test is printed to std::cerr -template<typename ValueT> -bool isValid(const NanoGrid<ValueT> &grid, bool detailed = true, bool verbose = false); - -/// @brief Allows for the validation of NanoVDB grids without any dependency -template<typename ValueT> -class GridValidator -{ - using GridT = NanoGrid<ValueT>; - inline static void checkTree( const GridT&, std::string&, bool); - inline static void checkRoot( const GridT&, std::string&, bool); - inline static void checkNodes(const GridT&, std::string&); - -public: - /// @brief Returns an error message (an empty string means no error) - /// - /// @param grid NanoVDB grid to be tested - /// @param detailed If true the checksum is computed and validated as well as all the node pointers - /// - /// @note The validation is much slower if @c detailed == true! - static std::string check(const GridT &grid, bool detailed = true); - -};// GridValidator - -//================================================================================================ - -template<typename ValueT> -std::string GridValidator<ValueT>::check(const GridT &grid, bool detailed) -{ - std::string errorStr; - - // First check the Grid - auto *data = reinterpret_cast<const GridData*>(&grid); - std::stringstream ss; - if (!isValid(data)) { - errorStr.assign("Grid is not 32B aligned"); - } else if (data->mMagic != NANOVDB_MAGIC_NUMBER && data->mMagic != NANOVDB_MAGIC_GRID) { - const uint64_t magic1 = NANOVDB_MAGIC_NUMBER, magic2 = NANOVDB_MAGIC_GRID; - const char *c0 = (const char*)&(data->mMagic), *c1=(const char*)&magic1, *c2=(const char*)&magic2; - ss << "Incorrect magic number: Expected \""; - for (int i=0; i<8; ++i) ss << c1[i]; - ss << "\" or \""; - for (int i=0; i<8; ++i) ss << c2[i]; - ss << "\", but found \""; - for (int i=0; i<8; ++i) ss << c0[i]; - ss << "\""; - errorStr = ss.str(); - } else if (!validateChecksum(grid, detailed ?
ChecksumMode::Full : ChecksumMode::Partial)) { - errorStr.assign("Mis-matching checksum"); - } else if (data->mVersion >= Version(29,0,0) && data->mVersion.getMajor() != NANOVDB_MAJOR_VERSION_NUMBER) { - ss << "Invalid major version number: Expected " << NANOVDB_MAJOR_VERSION_NUMBER << ", but read " << data->mVersion.c_str(); - errorStr = ss.str(); - } else if (data->mVersion < Version(29,0,0) && data->mVersion.id() != 28u) { - ss << "Invalid old major version number: Expected 28 or newer, but read " << data->mVersion.id(); - errorStr = ss.str(); - } else if (data->mGridClass >= GridClass::End) { - errorStr.assign("Invalid GridClass"); - } else if (data->mGridType >= GridType::End) { - errorStr.assign("Invalid GridType"); - } else if (data->mGridType != mapToGridType<ValueT>()) { - errorStr.assign("Invalid combination of ValueType and GridType"); - } else if (!isValid(data->mGridType, data->mGridClass)) { - errorStr.assign("Invalid combination of GridType and GridClass"); - } else if ( (const uint8_t*)(&(grid.tree())) != (const uint8_t*)(&grid+1) ) { - errorStr.assign("Invalid Tree pointer"); - } else { - checkTree(grid, errorStr, detailed); - } - return errorStr; -} - -//================================================================================================ - -template<typename ValueT> -void GridValidator<ValueT>::checkTree(const GridT &grid, std::string &errorStr, bool detailed) -{ - if (!isValid(&grid.tree())) { - errorStr.assign("Tree is not 32B aligned"); - } else if ( (const uint8_t*)(&grid.tree().root()) < (const uint8_t*)(&grid.tree()+1)) { - errorStr.assign("Invalid root pointer (should be located after the Grid and Tree)"); - } else if ( (const uint8_t*)(&grid.tree().root()) > (const uint8_t*)(&grid) + grid.gridSize() - sizeof(grid.tree().root()) ) { - errorStr.assign("Invalid root pointer (appears to be located after the end of the buffer)"); - } else { - checkRoot(grid, errorStr, detailed); - } -}// GridValidator::checkTree - -//================================================================================================ - -template<typename ValueT> -void GridValidator<ValueT>::checkRoot(const GridT &grid, std::string &errorStr, bool detailed) -{ - auto &root = grid.tree().root(); - auto *data = root.data(); - if (!isValid(data)) { - errorStr.assign("Root is not 32B aligned"); - } - const uint8_t *minPtr = (const uint8_t*)(&root + 1); - const uint8_t *maxPtr = (const uint8_t*)(&root) + root.memUsage(); - for (uint32_t i = 0; errorStr.empty() && i<data->mTableSize; ++i) { - const auto *tile = data->tile(i); - if ( (const uint8_t *) tile < minPtr ) { - errorStr.assign("Invalid root tile pointer (below lower bound)"); - } else if ( (const uint8_t *) tile > maxPtr - sizeof(*tile) ) { - errorStr.assign("Invalid root tile pointer (above higher bound)"); - } - } - if (detailed && errorStr.empty()) { - checkNodes(grid, errorStr); - } -}// GridValidator::checkRoot - -//================================================================================================ -template<typename ValueT> -void GridValidator<ValueT>::checkNodes(const GridT &grid, std::string &errorStr) -{ - auto &root = grid.tree().root();// note, the root node was already checked - const uint8_t *minPtr = (const uint8_t*)(&root) + root.memUsage(); - const uint8_t *maxPtr = (const uint8_t*)(&grid) + grid.gridSize(); - - auto check = [&](const void * ptr, size_t ptrSize) -> bool { - if (!isValid(ptr)) { - errorStr.assign("Invalid node pointer: not 32B aligned"); - } else if ( (const uint8_t *) ptr < minPtr ) { - errorStr.assign("Invalid node pointer: below lower bound"); - } else if ( (const
uint8_t *) ptr > maxPtr - ptrSize ) { - errorStr.assign("Invalid node pointer: above higher bound"); - } - return errorStr.empty(); - }; - - for (auto it2 = grid.tree().root().cbeginChild(); it2; ++it2) { - auto &node2 = *it2; - if (!check(&node2, sizeof(node2))) return; - for (auto it1 = node2.cbeginChild(); it1; ++it1) { - auto &node1 = *it1; - if (!check(&node1, sizeof(node1))) return; - for (auto it0 = node1.cbeginChild(); it0; ++it0) { - auto &node0 = *it0; - if (!check(&node0, sizeof(node0))) return; - }// loop over child nodes of the lower internal node - }// loop over child nodes of the upper internal node - }// loop over child nodes of the root node - -} // GridValidator::checkNodes - - -//================================================================================================ - -template<typename ValueT> -bool isValid(const NanoGrid<ValueT> &grid, bool detailed, bool verbose) -{ - const std::string str = GridValidator<ValueT>::check( grid, detailed ); - if (verbose && !str.empty()) std::cerr << "Validation failed: " << str << std::endl; - return str.empty(); -} - -} // namespace nanovdb - -#endif // NANOVDB_GRIDVALIDATOR_H_HAS_BEEN_INCLUDED +#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER +#include <nanovdb/tools/GridValidator.h> +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridValidator.h instead.") diff --git a/nanovdb/nanovdb/util/HDDA.h b/nanovdb/nanovdb/util/HDDA.h index d3ef5733e0..4430c40701 100644 --- a/nanovdb/nanovdb/util/HDDA.h +++ b/nanovdb/nanovdb/util/HDDA.h @@ -1,510 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/// @file HDDA.h -/// -/// @author Ken Museth -/// -/// @brief Hierarchical Digital Differential Analyzers specialized for VDB. - -#ifndef NANOVDB_HDDA_H_HAS_BEEN_INCLUDED -#define NANOVDB_HDDA_H_HAS_BEEN_INCLUDED - -// Comment out to disable this explicit round-off check -#define ENFORCE_FORWARD_STEPPING - -#include <nanovdb/NanoVDB.h> // only dependency - -namespace nanovdb { - -/// @brief A Digital Differential Analyzer specialized for OpenVDB grids -/// @note Conceptually similar to Bresenham's line algorithm applied -/// to a 3D Ray intersecting OpenVDB nodes or voxels. Log2Dim = 0 -/// corresponds to a voxel and Log2Dim > 0 to a tree node of size 2^Log2Dim. -/// -/// @note The Ray template class is expected to have the following -/// methods: test(time), t0(), t1(), invDir(), and operator()(time). -/// See the example Ray class above for their definition. -template<typename RayT, typename CoordT = Coord> -class HDDA -{ -public: - using RealType = typename RayT::RealType; - using RealT = RealType; - using Vec3Type = typename RayT::Vec3Type; - using Vec3T = Vec3Type; - using CoordType = CoordT; - - /// @brief Default ctor - HDDA() = default; - - /// @brief ctor from ray and dimension at which the DDA marches - __hostdev__ HDDA(const RayT& ray, int dim) { this->init(ray, dim); } - - /// @brief Re-initializes the HDDA - __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime, int dim) - { - assert(startTime <= maxTime); - mDim = dim; - mT0 = startTime; - mT1 = maxTime; - const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir(); - mVoxel = RoundDown<CoordT>(pos) & (~(dim - 1)); - for (int axis = 0; axis < 3; ++axis) { - if (dir[axis] == RealT(0)) { //handles dir = +/- 0 - mNext[axis] = Maximum<RealT>::value(); //i.e. disabled!
- mStep[axis] = 0; - } else if (inv[axis] > 0) { - mStep[axis] = 1; - mNext[axis] = mT0 + (mVoxel[axis] + dim - pos[axis]) * inv[axis]; - mDelta[axis] = inv[axis]; - } else { - mStep[axis] = -1; - mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; - mDelta[axis] = -inv[axis]; - } - } - } - - /// @brief Similar to init above except it uses the bounds of the input ray - __hostdev__ void init(const RayT& ray, int dim) { this->init(ray, ray.t0(), ray.t1(), dim); } - - /// @brief Updates the HDDA to march with the specified dimension - __hostdev__ bool update(const RayT& ray, int dim) - { - if (mDim == dim) - return false; - mDim = dim; - const Vec3T &pos = ray(mT0), &inv = ray.invDir(); - mVoxel = RoundDown<CoordT>(pos) & (~(dim - 1)); - for (int axis = 0; axis < 3; ++axis) { - if (mStep[axis] == 0) - continue; - mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; - if (mStep[axis] > 0) - mNext[axis] += dim * inv[axis]; - } - - return true; - } - - __hostdev__ int dim() const { return mDim; } - - /// @brief Increment the voxel index to the next intersected voxel or node - /// and returns true if the step in time does not exceed maxTime. - __hostdev__ bool step() - { - const int axis = MinIndex(mNext); -#if 1 - switch (axis) { - case 0: - return step<0>(); - case 1: - return step<1>(); - default: - return step<2>(); - } -#else - mT0 = mNext[axis]; - mNext[axis] += mDim * mDelta[axis]; - mVoxel[axis] += mDim * mStep[axis]; - return mT0 <= mT1; -#endif - } - - /// @brief Return the index coordinates of the next node or voxel - /// intersected by the ray. If Log2Dim = 0 the return value is the - /// actual signed coordinate of the voxel, else it is the origin - /// of the corresponding VDB tree node or tile. - /// @note Incurs no computational overhead. - __hostdev__ const CoordT& voxel() const { return mVoxel; } - - /// @brief Return the time (parameterized along the Ray) of the - /// first hit of a tree node of size 2^Log2Dim. - /// @details This value is initialized to startTime or ray.t0() - /// depending on the constructor used. - /// @note Incurs no computational overhead. - __hostdev__ RealType time() const { return mT0; } - - /// @brief Return the maximum time (parameterized along the Ray). - __hostdev__ RealType maxTime() const { return mT1; } - - /// @brief Return the time (parameterized along the Ray) of the - /// second (i.e. next) hit of a tree node of size 2^Log2Dim. - /// @note Incurs a (small) computational overhead.
- __hostdev__ RealType next() const - { -#if 1 //def __CUDA_ARCH__ - return fminf(mT1, fminf(mNext[0], fminf(mNext[1], mNext[2]))); -#else - return std::min(mT1, std::min(mNext[0], std::min(mNext[1], mNext[2]))); -#endif - } - -private: - // helper to implement the general form - template - __hostdev__ bool step() - { -#ifdef ENFORCE_FORWARD_STEPPING - //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + fmaxf(mNext[axis]*1.0e-6f, 1.0e-6f); - //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + (mNext[axis] + 1.0f)*1.0e-6f; - if (mNext[axis] <= mT0) { - mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; - } -#endif - mT0 = mNext[axis]; - mNext[ axis] += mDim * mDelta[axis]; - mVoxel[axis] += mDim * mStep[ axis]; - return mT0 <= mT1; - } - - int32_t mDim; - RealT mT0, mT1; // min and max allowed times - CoordT mVoxel, mStep; // current voxel location and step to next voxel location - Vec3T mDelta, mNext; // delta time and next time -}; // class HDDA - -/////////////////////////////////////////// ZeroCrossing //////////////////////////////////////////// - -/// @brief returns true if the ray intersects a zero-crossing at the voxel level of the grid in the accessor -/// The empty-space ray-marching is performed at all levels of the tree using an -/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the closest -/// voxel after the intersection point, v contains the grid values at ijk, and t is set to the time of -/// the intersection along the ray. -template -inline __hostdev__ bool ZeroCrossing(RayT& ray, AccT& acc, Coord& ijk, typename AccT::ValueType& v, float& t) -{ - if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20) - return false; // clip ray to bbox - static const float Delta = 1.0001f; - ijk = RoundDown(ray.start()); // first hit of bbox - HDDA hdda(ray, acc.getDim(ijk, ray)); - const auto v0 = acc.getValue(ijk); - while (hdda.step()) { - ijk = RoundDown(ray(hdda.time() + Delta)); - hdda.update(ray, acc.getDim(ijk, ray)); - if (hdda.dim() > 1 || !acc.isActive(ijk)) - continue; // either a tile value or an inactive voxel - while (hdda.step() && acc.isActive(hdda.voxel())) { // in the narrow band - v = acc.getValue(hdda.voxel()); - if (v * v0 < 0) { // zero crossing - ijk = hdda.voxel(); - t = hdda.time(); - return true; - } - } - } - return false; -} - -/////////////////////////////////////////// DDA //////////////////////////////////////////// - -/// @brief A Digital Differential Analyzer. Unlike HDDA (defined above) this DDA -/// uses a fixed step-size defined by the template parameter Dim! -/// -/// @note The Ray template class is expected to have the following -/// methods: test(time), t0(), t1(), invDir(), and operator()(time). -/// See the example Ray class above for their definition. 
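Typical use of the ZeroCrossing() helper above pairs a read accessor with an index-space ray. The following is a sketch under the pre-restructuring API; the level-set grid itself and the eye/direction values are assumed inputs:

@code
#include <nanovdb/util/HDDA.h>
#include <nanovdb/util/Ray.h>

// Sketch: 'grid' is assumed to be a loaded level-set NanoGrid<float>.
bool firstSurfaceHit(const nanovdb::NanoGrid<float>& grid,
                     const nanovdb::Vec3f&           eye,
                     const nanovdb::Vec3f&           dir,
                     nanovdb::Coord&                 ijk,
                     float&                          t)
{
    nanovdb::Ray<float> wRay(eye, dir);
    auto iRay = wRay.worldToIndexF(grid); // march in index space
    auto acc  = grid.getAccessor();
    float v;                              // value at the first zero crossing
    return nanovdb::ZeroCrossing(iRay, acc, ijk, v, t);
}
@endcode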
-template<typename RayT, typename CoordT = Coord, int Dim = 1> -class DDA -{ - static_assert(Dim >= 1, "Dim must be >= 1"); - -public: - using RealType = typename RayT::RealType; - using RealT = RealType; - using Vec3Type = typename RayT::Vec3Type; - using Vec3T = Vec3Type; - using CoordType = CoordT; - - /// @brief Default ctor - DDA() = default; - - /// @brief ctor from ray - __hostdev__ DDA(const RayT& ray) { this->init(ray); } - - /// @brief Re-initializes the DDA - __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime) - { - assert(startTime <= maxTime); - mT0 = startTime; - mT1 = maxTime; - const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir(); - mVoxel = RoundDown<CoordT>(pos) & (~(Dim - 1)); - for (int axis = 0; axis < 3; ++axis) { - if (dir[axis] == RealT(0)) { //handles dir = +/- 0 - mNext[axis] = Maximum<RealT>::value(); //i.e. disabled! - mStep[axis] = 0; - } else if (inv[axis] > 0) { - mStep[axis] = Dim; - mNext[axis] = (mT0 + (mVoxel[axis] + Dim - pos[axis]) * inv[axis]); - mDelta[axis] = inv[axis]; - } else { - mStep[axis] = -Dim; - mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; - mDelta[axis] = -inv[axis]; - } - } - } - - /// @brief Similar to init above except it uses the bounds of the input ray - __hostdev__ void init(const RayT& ray) { this->init(ray, ray.t0(), ray.t1()); } - - /// @brief Increment the voxel index to the next intersected voxel or node - /// and returns true if the step in time does not exceed maxTime. - __hostdev__ bool step() - { - const int axis = MinIndex(mNext); -#if 1 - switch (axis) { - case 0: - return step<0>(); - case 1: - return step<1>(); - default: - return step<2>(); - } -#else -#ifdef ENFORCE_FORWARD_STEPPING - if (mNext[axis] <= mT0) { - mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; - } -#endif - mT0 = mNext[axis]; - mNext[axis] += mDelta[axis]; - mVoxel[axis] += mStep[axis]; - return mT0 <= mT1; -#endif - } - - /// @brief Return the index coordinates of the next node or voxel - /// intersected by the ray. If Log2Dim = 0 the return value is the - /// actual signed coordinate of the voxel, else it is the origin - /// of the corresponding VDB tree node or tile. - /// @note Incurs no computational overhead. - __hostdev__ const CoordT& voxel() const { return mVoxel; } - - /// @brief Return the time (parameterized along the Ray) of the - /// first hit of a tree node of size 2^Log2Dim. - /// @details This value is initialized to startTime or ray.t0() - /// depending on the constructor used. - /// @note Incurs no computational overhead. - __hostdev__ RealType time() const { return mT0; } - - /// @brief Return the maximum time (parameterized along the Ray). - __hostdev__ RealType maxTime() const { return mT1; } - - /// @brief Return the time (parameterized along the Ray) of the - /// second (i.e. next) hit of a tree node of size 2^Log2Dim. - /// @note Incurs a (small) computational overhead.
- __hostdev__ RealType next() const - { - return Min(mT1, Min(mNext[0], Min(mNext[1], mNext[2]))); - } - - __hostdev__ int nextAxis() const - { - return nanovdb::MinIndex(mNext); - } - -private: - // helper to implement the general form - template - __hostdev__ bool step() - { -#ifdef ENFORCE_FORWARD_STEPPING - if (mNext[axis] <= mT0) { - mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; - } -#endif - mT0 = mNext[axis]; - mNext[axis] += mDelta[axis]; - mVoxel[axis] += mStep[axis]; - return mT0 <= mT1; - } - - RealT mT0, mT1; // min and max allowed times - CoordT mVoxel, mStep; // current voxel location and step to next voxel location - Vec3T mDelta, mNext; // delta time and next time -}; // class DDA - -/////////////////////////////////////////// ZeroCrossingNode //////////////////////////////////////////// - -template -inline __hostdev__ bool ZeroCrossingNode(RayT& ray, const NodeT& node, float v0, nanovdb::Coord& ijk, float& v, float& t) -{ - BBox bbox(node.origin(), node.origin() + Coord(node.dim() - 1)); - - if (!ray.clip(node.bbox())) { - return false; - } - - const float t0 = ray.t0(); - - static const float Delta = 1.0001f; - ijk = Coord::Floor(ray(ray.t0() + Delta)); - - t = t0; - v = 0; - - DDA dda(ray); - while (dda.step()) { - ijk = dda.voxel(); - - if (bbox.isInside(ijk) == false) - return false; - - v = node.getValue(ijk); - if (v * v0 < 0) { - t = dda.time(); - return true; - } - } - return false; -} - -/////////////////////////////////////////// TreeMarcher //////////////////////////////////////////// - -/// @brief returns true if the ray intersects an active value at any level of the grid in the accessor. -/// The empty-space ray-marching is performed at all levels of the tree using an -/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the first -/// active voxel or tile, and t is set to the time of its intersection along the ray. -template -inline __hostdev__ bool firstActive(RayT& ray, AccT& acc, Coord &ijk, float& t) -{ - if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20) {// clip ray to bbox - return false;// missed or undefined bbox - } - static const float Delta = 1.0001f;// forward step-size along the ray to avoid getting stuck - t = ray.t0();// initiate time - ijk = RoundDown(ray.start()); // first voxel inside bbox - for (HDDA hdda(ray, acc.getDim(ijk, ray)); !acc.isActive(ijk); hdda.update(ray, acc.getDim(ijk, ray))) { - if (!hdda.step()) return false;// leap-frog HDDA and exit if ray bound is exceeded - t = hdda.time() + Delta;// update time - ijk = RoundDown( ray(t) );// update ijk - } - return true; -} - -/////////////////////////////////////////// TreeMarcher //////////////////////////////////////////// - -/// @brief A Tree Marcher for Generic Grids - -template -class TreeMarcher -{ -public: - using ChildT = typename NodeT::ChildNodeType; - using RealType = typename RayT::RealType; - using RealT = RealType; - using CoordType = CoordT; - - inline __hostdev__ TreeMarcher(AccT& acc) - : mAcc(acc) - { - } - - /// @brief Initialize the TreeMarcher with an index-space ray. - inline __hostdev__ bool init(const RayT& indexRay) - { - mRay = indexRay; - if (!mRay.clip(mAcc.root().bbox())) - return false; // clip ray to bbox - - // tweak the intersection span into the bbox. - // CAVEAT: this will potentially clip some tiny corner intersections. 
- static const float Eps = 0.000001f; - const float t0 = mRay.t0() + Eps; - const float t1 = mRay.t1() - Eps; - if (t0 > t1) - return false; - - const CoordT ijk = RoundDown<CoordT>(mRay(t0)); - const uint32_t dim = mAcc.getDim(ijk, mRay); - mHdda.init(mRay, t0, t1, nanovdb::Max(dim, NodeT::dim())); - - mT0 = (dim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span. - mTmax = t1; - return true; - } - - /// @brief step the ray through the tree. If the ray hits a node then - /// populate t0 & t1, and the node. - /// @return true when a node of type NodeT is intersected, false otherwise. - inline __hostdev__ bool step(const NodeT** node, float& t0, float& t1) - { - // CAVEAT: if Delta is too large then it will clip corners of nodes in a visible way. - // but it has to be quite large when very far from the grid (due to fp32 rounding) - static const float Delta = 0.01f; - bool hddaIsValid; - - do { - t0 = mT0; - - auto currentNode = mAcc.template getNode<NodeT>(); - - // get next node intersection... - hddaIsValid = mHdda.step(); - const CoordT nextIjk = RoundDown<CoordT>(mRay(mHdda.time() + Delta)); - const auto nextDim = mAcc.getDim(nextIjk, mRay); - mHdda.update(mRay, (int)Max(nextDim, NodeT::dim())); - mT0 = (nextDim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span. - - if (t0 >= 0) { // we are in a span. - t1 = Min(mTmax, mHdda.time()); - - // TODO: clean this up! - if (t0 >= t1 || currentNode == nullptr) - continue; - - *node = currentNode; - return true; - } - - } while (hddaIsValid); - - return false; - } - - inline __hostdev__ const RayT& ray() const { return mRay; } - - inline __hostdev__ RayT& ray() { return mRay; } - -private: - AccT& mAcc; - RayT mRay; - HDDA<RayT, CoordT> mHdda; - float mT0; - float mTmax; -};// TreeMarcher - -/////////////////////////////////////////// PointTreeMarcher //////////////////////////////////////////// - -/// @brief A Tree Marcher for Point Grids -/// -/// @note This class will correctly handle offsetting the ray by 0.5 to ensure that -/// the underlying HDDA will intersect with the grid-cells. See details below. - -template<typename BuildT, typename RayT, typename AccT, typename CoordT = Coord> -class PointTreeMarcher : public TreeMarcher<NanoLeaf<BuildT>, RayT, AccT, CoordT> -{ - using BaseT = TreeMarcher<NanoLeaf<BuildT>, RayT, AccT, CoordT>; -public: - __hostdev__ PointTreeMarcher(AccT& acc) : BaseT(acc) {} - - /// @brief Initiates this instance with a ray in index space. - /// - /// @details An offset by 0.5 is applied to the ray to account for the fact that points in vdb - /// grids are bucketed into so-called grid cells, which are centered around grid voxels, - /// whereas the DDA is based on so-called grid nodes, which are coincident with grid - /// voxels. So, rather than offsetting the points by 0.5 to bring them into a grid - /// node representation this method offsets the eye of the ray by 0.5, which effectively - /// ensures that the DDA operates on grid cells as opposed to grid nodes. This subtle - /// but important offset by 0.5 is explained in more detail in our online documentation.
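The effect of that 0.5 offset can be illustrated in isolation: a point bucketed into the cell centered on voxel i occupies [i-0.5, i+0.5), so after shifting a coordinate by +0.5 the node-style floor() lands on the cell index (plain C++ sketch, not the NanoVDB API):

@code
#include <cmath>
#include <cstdio>

int main()
{
    // cell i covers [i-0.5, i+0.5); a node-based DDA visits floor(x),
    // so marching the shifted coordinate x+0.5 walks cells instead of nodes.
    for (double x : {-0.6, -0.4, 0.49, 0.51}) {
        const int node = (int)std::floor(x);       // unshifted: node/voxel index
        const int cell = (int)std::floor(x + 0.5); // shifted: grid-cell index
        std::printf("x=%+.2f  node=%d  cell=%d\n", x, node, cell);
    }
}
@endcode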
- __hostdev__ bool init(RayT ray) { return BaseT::init(ray.offsetEye(0.5)); } -};// PointTreeMarcher - -} // namespace nanovdb - -#endif // NANOVDB_HDDA_H_HAS_BEEN_INCLUDED +#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER +#include <nanovdb/math/HDDA.h> +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/HDDA.h instead.") diff --git a/nanovdb/nanovdb/util/HostBuffer.h b/nanovdb/nanovdb/util/HostBuffer.h index e0520d6983..b843eed478 100644 --- a/nanovdb/nanovdb/util/HostBuffer.h +++ b/nanovdb/nanovdb/util/HostBuffer.h @@ -1,595 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - @file HostBuffer.h - - @date April 20, 2021 - - @brief HostBuffer - a buffer that contains a shared or private bump - pool to either externally or internally managed host memory. - - @details This HostBuffer can be used in multiple ways, most of which are - demonstrated in the examples below. Memory in the pool can - be managed or unmanaged (e.g. internal or external) and can - be shared between multiple buffers or belong to a single buffer. - - Example that uses HostBuffer::create inside io::readGrids to create a - full self-managed buffer, i.e. not shared and without padding, per grid in the file. - @code - auto handles = nanovdb::io::readGrids("file.nvdb"); - @endcode - - Example that uses HostBuffer::createFull. Assuming you have a raw pointer - to a NanoVDB grid of unknown type, this example shows how to create its - GridHandle which can be used to enquire about the grid type and meta data. - @code - void *data;// pointer to a NanoVDB grid of unknown type - uint64_t size;// byte size of NanoVDB grid of unknown type - auto buffer = nanovdb::HostBuffer::createFull(size, data); - nanovdb::GridHandle<> gridHandle(std::move(buffer)); - @endcode - - Example that uses HostBuffer::createPool for internally managed host memory. - Suppose you want to read multiple grids in multiple files, but reuse the same - fixed sized memory buffer to both avoid memory fragmentation as well as - exceeding the fixed memory ceiling! - @code - auto pool = nanovdb::HostBuffer::createPool(1 << 30);// 1 GB memory pool - std::vector<std::string> frames;// vector of grid names - for (int i=0; i<frames.size(); ++i) { - auto handles = nanovdb::io::readGrids(frames[i], 0, pool); - ... - } - @endcode - - Example that uses HostBuffer::createPool for externally managed host memory. - @code - const size_t poolSize = 1 << 30;// 1 GB - uint8_t *data = static_cast<uint8_t*>(std::malloc(poolSize + NANOVDB_DATA_ALIGNMENT));// 1 GB pool - uint8_t *buffer = nanovdb::alignPtr(data);// 32B aligned buffer - //uint8_t *buffer = std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, poolSize);// in C++17 - auto pool = nanovdb::HostBuffer::createPool(poolSize, buffer); - auto handles1 = nanovdb::io::readGrids("file1.nvdb", 0, pool); - auto handles2 = nanovdb::io::readGrids("file2.nvdb", 0, pool); - .... - std::free(data); - @endcode - - Example that uses HostBuffer::createPool for externally managed host memory. - Note that in this example @c handles are allowed to outlive @c pool since - they internally store a shared pointer to the memory pool. However @c array - MUST outlive @c handles since the pool does not own its memory in this example.
- @code - const size_t poolSize = 1 << 30;// 1 GB - std::unique_ptr<uint8_t[]> array(new uint8_t[poolSize+NANOVDB_DATA_ALIGNMENT]);// scoped pool of 1 GB - //std::unique_ptr<uint8_t[]> array(std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, poolSize));// in C++17 - uint8_t *buffer = nanovdb::alignPtr(array.get());// 32B aligned buffer - auto pool = nanovdb::HostBuffer::createPool(poolSize, buffer); - auto handles = nanovdb::io::readGrids("file.nvdb", 0, pool); - @endcode -*/ - -#ifndef NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED -#define NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED - -#include <nanovdb/NanoVDB.h> // for NANOVDB_DATA_ALIGNMENT; -#include <cstdint> // for types like int32_t etc -#include <cstdio> // for fprintf -#include <cstdlib> // for std::malloc/std::realloc/std::free -#include <memory> // for std::make_shared -#include <mutex> // for std::mutex -#include <unordered_set> // for std::unordered_set -#include <cassert> // for assert -#include <sstream> // for std::stringstream -#include <cstring> // for memcpy - -#define checkPtr(ptr, msg) \ - { \ - ptrAssert((ptr), (msg), __FILE__, __LINE__); \ - } - -namespace nanovdb { - -template<typename BufferT> -struct BufferTraits -{ - static constexpr bool hasDeviceDual = false; -}; - -// ----------------------------> HostBuffer <-------------------------------------- - -/// @brief This is a buffer that contains a shared or private pool -/// to either externally or internally managed host memory. -/// -/// @note Terminology: -/// Pool: 0 = buffer.size() < buffer.poolSize() -/// Buffer: 0 < buffer.size() < buffer.poolSize() -/// Full: 0 < buffer.size() = buffer.poolSize() -/// Empty: 0 = buffer.size() = buffer.poolSize() -class HostBuffer -{ - struct Pool;// forward declaration of private pool struct - std::shared_ptr<Pool> mPool; - uint64_t mSize; // total number of bytes for the NanoVDB grid. - uint8_t* mData; // raw buffer for the NanoVDB grid. - -#if defined(DEBUG) || defined(_DEBUG) - static inline void ptrAssert(void* ptr, const char* msg, const char* file, int line, bool abort = true) - { - if (ptr == nullptr) { - fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); - if (abort) - exit(1); - } - if (uint64_t(ptr) % NANOVDB_DATA_ALIGNMENT) { - fprintf(stderr, "Alignment pointer error: %s %s %d\n", msg, file, line); - if (abort) - exit(1); - } - } -#else - static inline void ptrAssert(void*, const char*, const char*, int, bool = true) - { - } -#endif - -public: - /// @brief Return a full buffer or an empty buffer - HostBuffer(uint64_t bufferSize = 0); - - /// @brief Move constructor - HostBuffer(HostBuffer&& other); - - /// @brief Custom destructor - ~HostBuffer() { this->clear(); } - - /// @brief Move assignment operation - HostBuffer& operator=(HostBuffer&& other); - - /// @brief Disallow copy-construction - HostBuffer(const HostBuffer&) = delete; - - /// @brief Disallow copy assignment operation - HostBuffer& operator=(const HostBuffer&) = delete; - - /// @brief Return a pool buffer which satisfies: buffer.size == 0, - /// buffer.poolSize() == poolSize, and buffer.data() == nullptr. - /// If data==nullptr, memory for the pool will be allocated. - /// - /// @throw If poolSize is zero. - static HostBuffer createPool(uint64_t poolSize, void *data = nullptr); - - /// @brief Return a full buffer which satisfies: buffer.size == bufferSize, - /// buffer.poolSize() == bufferSize, and buffer.data() == data. - /// If data==nullptr, memory for the pool will be allocated. - /// - /// @throw If bufferSize is zero. - static HostBuffer createFull(uint64_t bufferSize, void *data = nullptr); - - /// @brief Return a buffer with @c bufferSize bytes managed by - /// the specified memory @c pool.
If none is provided, i.e. - /// @c pool == nullptr or @c pool->poolSize() == 0, one is - /// created with size @c bufferSize, i.e. a full buffer is returned. - /// - /// @throw If the specified @c pool has insufficient memory for - /// the requested buffer size. - static HostBuffer create(uint64_t bufferSize, const HostBuffer* pool = nullptr); - - /// @brief Initialize as a full buffer with the specified size. If data is NULL - /// the memory is internally allocated. - void init(uint64_t bufferSize, void *data = nullptr); - - //@{ - /// @brief Retuns a pointer to the raw memory buffer managed by this allocator. - /// - /// @warning Note that the pointer can be NULL if the allocator was not initialized! - const uint8_t* data() const { return mData; } - uint8_t* data() { return mData; } - //@} - - //@{ - /// @brief Returns the size in bytes associated with this buffer. - uint64_t bufferSize() const { return mSize; } - uint64_t size() const { return this->bufferSize(); } - //@} - - /// @brief Returns the size in bytes of the memory pool shared with this instance. - uint64_t poolSize() const; - - /// @brief Return true if memory is managed (using std::malloc and std:free) by the - /// shared pool in this buffer. Else memory is assumed to be managed externally. - bool isManaged() const; - - //@{ - /// @brief Returns true if this buffer has no memory associated with it - bool isEmpty() const { return !mPool || mSize == 0 || mData == nullptr; } - bool empty() const { return this->isEmpty(); } - //@} - - /// @brief Return true if this is a pool, i.e. an empty buffer with a nonempty - /// internal pool, i.e. this->size() == 0 and this->poolSize() != 0 - bool isPool() const { return mSize == 0 && this->poolSize() > 0; } - - /// @brief Return true if the pool exists, is nonempty but has no more available memory - bool isFull() const; - - /// @brief Clear this buffer so it is empty. - void clear(); - - /// @brief Clears all existing buffers that are registered against the memory pool - /// and resets the pool so it can be reused to create new buffers. - /// - /// @throw If this instance is not empty or contains no pool. - /// - /// @warning This method is not thread-safe! - void reset(); - - /// @brief Total number of bytes from the pool currently in use by buffers - uint64_t poolUsage() const; - - /// @brief resize the pool size. It will attempt to resize the existing - /// memory block, but if that fails a deep copy is performed. - /// If @c data is not NULL it will be used as new externally - /// managed memory for the pool. All registered buffers are - /// updated so GridHandle::grid might return a new address (if - /// deep copy was performed). - /// - /// @note This method can be use to resize the memory pool and even - /// change it from internally to externally managed memory or vice versa. - /// - /// @throw if @c poolSize is less than this->poolUsage() the used memory - /// or allocations fail. 
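/*! A minimal sketch (not from the original header) of the pool workflow, using
    only members declared in this class, including the resizePool() member that
    follows below:
    @code
    auto pool = nanovdb::HostBuffer::createPool(1 << 20);      // 1 MB internally managed pool
    assert(pool.isPool() && pool.poolUsage() == 0);
    auto buf = nanovdb::HostBuffer::create(512 * 1024, &pool); // carve 512 KB out of the pool
    assert(pool.poolUsage() >= 512 * 1024);                    // usage can include alignment padding
    pool.resizePool(2 << 20);                                  // grow the pool; buf stays registered,
                                                               // but buf.data() may have moved
    buf.clear();                                               // release the buffer...
    pool.reset();                                              // ...and recycle the pool
    @endcode
*/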
- void resizePool(uint64_t poolSize, void *data = nullptr); - -}; // HostBuffer class - -// --------------------------> Implementation of HostBuffer::Pool <------------------------------------ - -// This is private struct of HostBuffer so you can safely ignore the API -struct HostBuffer::Pool -{ - using HashTableT = std::unordered_set; - std::mutex mMutex; // mutex for updating mRegister and mFree - HashTableT mRegister; - uint8_t* mData; - uint8_t* mFree; - uint64_t mSize; - uint64_t mPadding; - bool mManaged; - - /// @brief External memory ctor - Pool(uint64_t size = 0, void* data = nullptr) - : mData((uint8_t*)data) - , mFree(mData) - , mSize(size) - , mPadding(0) - , mManaged(data == nullptr) - { - if (mManaged) { - mData = static_cast(Pool::alloc(mSize)); - if (mData == nullptr) { - throw std::runtime_error("Pool::Pool malloc failed"); - } - } - mPadding = alignmentPadding(mData); - if (!mManaged && mPadding != 0) { - throw std::runtime_error("Pool::Pool: external memory buffer is not aligned to " + - std::to_string(NANOVDB_DATA_ALIGNMENT) + - " bytes.\nHint: use nanovdb::alignPtr or std::aligned_alloc (C++17 only)"); - } - mFree = mData + mPadding; - } - - /// @brief Custom destructor - ~Pool() - { - assert(mRegister.empty()); - if (mManaged) { - std::free(mData); - } - } - - /// @brief Disallow copy-construction - Pool(const Pool&) = delete; - - /// @brief Disallow move-construction - Pool(const Pool&&) = delete; - - /// @brief Disallow copy assignment operation - Pool& operator=(const Pool&) = delete; - - /// @brief Disallow move assignment operation - Pool& operator=(const Pool&&) = delete; - - /// @brief Return the total number of bytes used from this Pool by buffers - uint64_t usage() const { return static_cast(mFree - mData) - mPadding; } - - /// @brief Allocate a buffer of the specified size and add it to the register - void add(HostBuffer* buffer, uint64_t size) - { - auto* alignedFree = mFree + alignmentPadding(mFree); - - if (alignedFree + size > mData + mPadding + mSize) { - std::stringstream ss; - ss << "HostBuffer::Pool: insufficient memory\n" - << "\tA buffer requested " << size << " bytes with " << NANOVDB_DATA_ALIGNMENT - << "-bytes alignment from a pool with " - << mSize << " bytes of which\n\t" << (alignedFree - mData - mPadding) - << " bytes are used by " << mRegister.size() << " other buffer(s). " - << "Pool is " << (mManaged ? "internally" : "externally") << " managed.\n"; - //std::cerr << ss.str(); - throw std::runtime_error(ss.str()); - } - buffer->mSize = size; - const std::lock_guard lock(mMutex); - mRegister.insert(buffer); - buffer->mData = alignedFree; - mFree = alignedFree + size; - } - - /// @brief Remove the specified buffer from the register - void remove(HostBuffer *buffer) - { - const std::lock_guard lock(mMutex); - mRegister.erase(buffer); - } - - /// @brief Replaces buffer1 with buffer2 in the register - void replace(HostBuffer *buffer1, HostBuffer *buffer2) - { - const std::lock_guard lock(mMutex); - mRegister.erase( buffer1); - mRegister.insert(buffer2); - } - - /// @brief Reset the register and all its buffers - void reset() - { - for (HostBuffer *buffer : mRegister) { - buffer->mPool.reset(); - buffer->mSize = 0; - buffer->mData = nullptr; - } - mRegister.clear(); - mFree = mData + mPadding; - } - - /// @brief Resize this Pool and update registered buffers as needed. If data is no NULL - /// it is used as externally managed memory. 
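/*! Illustration only, mirroring the examples in the file comment above: both the
    Pool constructor and resize() below throw if external memory is not aligned to
    NANOVDB_DATA_ALIGNMENT (32B), so over-allocate and align the start yourself:
    @code
    const uint64_t poolSize = 1 << 20;
    std::unique_ptr<uint8_t[]> raw(new uint8_t[poolSize + NANOVDB_DATA_ALIGNMENT]);
    uint8_t* aligned = nanovdb::alignPtr(raw.get());           // 32B-aligned start
    auto pool = nanovdb::HostBuffer::createPool(poolSize, aligned);
    // 'raw' must outlive 'pool' and every buffer created from it
    @endcode
*/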
- void resize(uint64_t size, void *data = nullptr) - { - const uint64_t memUsage = this->usage(); - - const bool managed = (data == nullptr); - - if (!managed && alignmentPadding(data) != 0) { - throw std::runtime_error("Pool::resize: external memory buffer is not aligned to " + - std::to_string(NANOVDB_DATA_ALIGNMENT) + " bytes"); - } - - if (memUsage > size) { - throw std::runtime_error("Pool::resize: insufficient memory"); - } - - uint64_t padding = 0; - if (mManaged && managed && size != mSize) { // managed -> managed - padding = mPadding; - data = Pool::realloc(mData, memUsage, size, padding); // performs both copy and free of mData - } else if (!mManaged && managed) { // un-managed -> managed - data = Pool::alloc(size); - padding = alignmentPadding(data); - } - - if (data == nullptr) { - throw std::runtime_error("Pool::resize: allocation failed"); - } else if (data != mData) { - auto* paddedData = static_cast(data) + padding; - - if (!(mManaged && managed)) { // no need to copy if managed -> managed - memcpy(paddedData, mData + mPadding, memUsage); - } - - for (HostBuffer* buffer : mRegister) { // update registered buffers - buffer->mData = paddedData + ptrdiff_t(buffer->mData - (mData + mPadding)); - } - mFree = paddedData + memUsage; // update the free pointer - if (mManaged && !managed) {// only free if managed -> un-managed - std::free(mData); - } - - mData = static_cast(data); - mPadding = padding; - } - mSize = size; - mManaged = managed; - } - /// @brief Return true is all the memory in this pool is in use. - bool isFull() const - { - assert(mFree <= mData + mPadding + mSize); - return mSize > 0 ? mFree == mData + mPadding + mSize : false; - } - -private: - - static void* alloc(uint64_t size) - { -//#if (__cplusplus >= 201703L) -// return std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, size);//C++17 or newer -//#else - // make sure we alloc enough space to align the result - return std::malloc(size + NANOVDB_DATA_ALIGNMENT); -//#endif - } - - static void* realloc(void* const origData, - uint64_t origSize, - uint64_t desiredSize, - uint64_t& padding) - { - // make sure we alloc enough space to align the result - void* data = std::realloc(origData, desiredSize + NANOVDB_DATA_ALIGNMENT); - - if (data != nullptr && data != origData) { - uint64_t newPadding = alignmentPadding(data); - // Number of padding bytes may have changed -- move data if that's the case - if (newPadding != padding) { - // Realloc should not happen when shrinking down buffer, but let's be safe - std::memmove(static_cast(data) + newPadding, - static_cast(data) + padding, - Min(origSize, desiredSize)); - padding = newPadding; - } - } - - return data; - } - -};// struct HostBuffer::Pool - -// --------------------------> Implementation of HostBuffer <------------------------------------ - -inline HostBuffer::HostBuffer(uint64_t size) : mPool(nullptr), mSize(size), mData(nullptr) -{ - if (size>0) { - mPool = std::make_shared(size); - mData = mPool->mFree; - mPool->mRegister.insert(this); - mPool->mFree += size; - } -} - -inline HostBuffer::HostBuffer(HostBuffer&& other) : mPool(other.mPool), mSize(other.mSize), mData(other.mData) -{ - if (mPool && mSize != 0) { - mPool->replace(&other, this); - } - other.mPool.reset(); - other.mSize = 0; - other.mData = nullptr; -} - -inline void HostBuffer::init(uint64_t bufferSize, void *data) -{ - if (bufferSize == 0) { - throw std::runtime_error("HostBuffer: invalid buffer size"); - } - if (mPool) { - mPool.reset(); - } - if (!mPool || mPool->mSize != bufferSize) { - mPool = 
std::make_shared(bufferSize, data); - } - mPool->add(this, bufferSize); -} - -inline HostBuffer& HostBuffer::operator=(HostBuffer&& other) -{ - if (mPool) { - mPool->remove(this); - } - mPool = other.mPool; - mSize = other.mSize; - mData = other.mData; - if (mPool && mSize != 0) { - mPool->replace(&other, this); - } - other.mPool.reset(); - other.mSize = 0; - other.mData = nullptr; - return *this; -} - -inline uint64_t HostBuffer::poolSize() const -{ - return mPool ? mPool->mSize : 0u; -} - -inline uint64_t HostBuffer::poolUsage() const -{ - return mPool ? mPool->usage(): 0u; -} - -inline bool HostBuffer::isManaged() const -{ - return mPool ? mPool->mManaged : false; -} - -inline bool HostBuffer::isFull() const -{ - return mPool ? mPool->isFull() : false; -} - -inline HostBuffer HostBuffer::createPool(uint64_t poolSize, void *data) -{ - if (poolSize == 0) { - throw std::runtime_error("HostBuffer: invalid pool size"); - } - HostBuffer buffer; - buffer.mPool = std::make_shared(poolSize, data); - // note the buffer is NOT registered by its pool since it is not using its memory - buffer.mSize = 0; - buffer.mData = nullptr; - return buffer; -} - -inline HostBuffer HostBuffer::createFull(uint64_t bufferSize, void *data) -{ - if (bufferSize == 0) { - throw std::runtime_error("HostBuffer: invalid buffer size"); - } - HostBuffer buffer; - buffer.mPool = std::make_shared(bufferSize, data); - buffer.mPool->add(&buffer, bufferSize); - return buffer; -} - -inline HostBuffer HostBuffer::create(uint64_t bufferSize, const HostBuffer* pool) -{ - HostBuffer buffer; - if (pool == nullptr || !pool->mPool) { - buffer.mPool = std::make_shared(bufferSize); - } else { - buffer.mPool = pool->mPool; - } - buffer.mPool->add(&buffer, bufferSize); - return buffer; -} - -inline void HostBuffer::clear() -{ - if (mPool) {// remove self from the buffer register in the pool - mPool->remove(this); - } - mPool.reset(); - mSize = 0; - mData = nullptr; -} - -inline void HostBuffer::reset() -{ - if (this->size()>0) { - throw std::runtime_error("HostBuffer: only empty buffers can call reset"); - } - if (!mPool) { - throw std::runtime_error("HostBuffer: this buffer contains no pool to reset"); - } - mPool->reset(); -} - -inline void HostBuffer::resizePool(uint64_t size, void *data) -{ - if (!mPool) { - throw std::runtime_error("HostBuffer: this buffer contains no pool to resize"); - } - mPool->resize(size, data); -} - -} // namespace nanovdb - -#endif // end of NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/HostBuffer.h instead.") diff --git a/nanovdb/nanovdb/util/IO.h b/nanovdb/nanovdb/util/IO.h index 5d51cb53c6..49d51e4f24 100644 --- a/nanovdb/nanovdb/util/IO.h +++ b/nanovdb/nanovdb/util/IO.h @@ -1,796 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file IO.h - - \author Ken Museth - - \date May 1, 2020 - - \brief Implements I/O for NanoVDB grids. Features optional BLOSC and ZIP - file compression, support for multiple grids per file as well as - multiple grid types. - - \note This file does NOT depend on OpenVDB, but optionally on ZIP and BLOSC - - \details NanoVDB files take on of two formats: - 1) multiple segments each with multiple grids (segments have easy to access metadata about its grids) - 2) starting with verion 32.6.0 nanovdb files also support a raw buffer with one or more grids (just a - dump of a raw grid buffer, so no new metadata). 
- - // 1: Segment: FileHeader, MetaData0, gridName0...MetaDataN, gridNameN, compress Grid0,...compressed GridN - // 2: Raw: Grid0,...GridN -*/ - -#ifndef NANOVDB_IO_H_HAS_BEEN_INCLUDED -#define NANOVDB_IO_H_HAS_BEEN_INCLUDED - -#include -#include "GridHandle.h" -#include "GridChecksum.h"// for updateGridCount - -#include // for std::ifstream -#include // for std::cerr/cout -#include // for std::string -#include // for std::stringstream -#include // for std::strcmp -#include // for std::unique_ptr -#include // for std::vector -#ifdef NANOVDB_USE_ZIP -#include // for ZIP compression -#endif -#ifdef NANOVDB_USE_BLOSC -#include // for BLOSC compression -#endif - -// Due to a bug in older versions of gcc, including fstream might -// define "major" and "minor" which are used as member data below. -// See https://bugzilla.redhat.com/show_bug.cgi?id=130601 -#if defined(major) || defined(minor) -#undef major -#undef minor -#endif - -namespace nanovdb { - -namespace io { - -// --------------------------> writeGrid(s) <------------------------------------ - -/// @brief Write a single grid to file (over-writing existing content of the file) -template -void writeGrid(const std::string& fileName, const GridHandle& handle, io::Codec codec = io::Codec::NONE, int verbose = 0); - -/// @brief Write multiple grids to file (over-writing existing content of the file) -template class VecT = std::vector> -void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec = Codec::NONE, int verbose = 0); - -// --------------------------> readGrid(s) <------------------------------------ - -/// @brief Read and return one or all grids from a file into a single GridHandle -/// @tparam BufferT Type of buffer used memory allocation -/// @param fileName string name of file to be read from -/// @param n zero-based signed index of the grid to be read. -/// The default value of 0 means read only first grid. -/// A negative value of n means read all grids in the file. -/// @param verbose specify verbosity level. Default value of zero means quiet. -/// @param buffer optional buffer used for memory allocation -/// @return return a single GridHandle with one or all grids found in the file -/// @throw will throw a std::runtime_error if the file does not contain a grid with index n -template -GridHandle readGrid(const std::string& fileName, int n = 0, int verbose = 0, const BufferT& buffer = BufferT()); - -/// @brief Read and return the first grid with a specific name from a file -/// @tparam BufferT Type of buffer used memory allocation -/// @param fileName string name of file to be read from -/// @param gridName string name of the grid to be read -/// @param verbose specify verbosity level. Default value of zero means quiet. -/// @param buffer optional buffer used for memory allocation -/// @return return a single GridHandle containing the grid with the specific name -/// @throw will throw a std::runtime_error if the file does not contain a grid with the specific name -template -GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose = 0, const BufferT& buffer = BufferT()); - -/// @brief Read all the grids in the file and return them as a vector of multiple GridHandles, each containing -/// all grids encoded in the same segment of the file (i.e. they where written together) -/// @tparam BufferT Type of buffer used memory allocation -/// @param fileName string name of file to be read from -/// @param verbose specify verbosity level. Default value of zero means quiet. 
-/// @param buffer optional buffer used for memory allocation -/// @return Return a vector of GridHandles each containing all grids encoded -/// in the same segment of the file (i.e. they where written together). -template class VecT = std::vector> -VecT> readGrids(const std::string& fileName, int verbose = 0, const BufferT& buffer = BufferT()); - -// ----------------------------------------------------------------------- - -/// We fix a specific size for counting bytes in files so that they -/// are saved the same regardless of machine precision. (Note there are -/// still little/bigendian issues, however) -using fileSize_t = uint64_t; - -/// @brief Internal functions for compressed read/write of a NanoVDB GridHandle into a stream -/// -/// @warning These functions should never be called directly by client code -namespace Internal { -static constexpr fileSize_t MAX_SIZE = 1UL << 30; // size is 1 GB - -template -static fileSize_t write(std::ostream& os, const GridHandle& handle, Codec codec, uint32_t n); - -template -static void read(std::istream& is, BufferT& buffer, Codec codec); - -static void read(std::istream& is, char* data, fileSize_t size, Codec codec); -} // namespace Internal - -/// @brief Standard hash function to use on strings; std::hash may vary by -/// platform/implementation and is know to produce frequent collisions. -uint64_t stringHash(const char* cstr); - -/// @brief Return a uint64_t hash key of a std::string -inline uint64_t stringHash(const std::string& str){return stringHash(str.c_str());} - -/// @brief Return a uint64_t with its bytes reversed so we can check for endianness -inline uint64_t reverseEndianness(uint64_t val) -{ - return (((val) >> 56) & 0x00000000000000FF) | (((val) >> 40) & 0x000000000000FF00) | - (((val) >> 24) & 0x0000000000FF0000) | (((val) >> 8) & 0x00000000FF000000) | - (((val) << 8) & 0x000000FF00000000) | (((val) << 24) & 0x0000FF0000000000) | - (((val) << 40) & 0x00FF000000000000) | (((val) << 56) & 0xFF00000000000000); -} - -/// @brief This class defines the meta data stored for each grid in a segment -/// -/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData -/// each followed by grid names and then finally the grids themselves. -/// -/// @note This class should not be confused with nanovdb::GridMetaData defined in NanoVDB.h -/// Also, FileMetaData is defined in NanoVDB.h. -struct FileGridMetaData : public FileMetaData -{ - static_assert(sizeof(FileMetaData) == 176, "Unexpected sizeof(FileMetaData)"); - std::string gridName; - void read(std::istream& is); - void write(std::ostream& os) const; - FileGridMetaData() {} - template - FileGridMetaData(uint64_t size, Codec c, const NanoGrid& grid); - uint64_t memUsage() const { return sizeof(FileMetaData) + nameSize; } -}; // FileGridMetaData - -/// @brief This class defines all the data stored in segment of a file -/// -/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData -/// each followed by grid names and then finally the grids themselves. 
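/*! A short I/O sketch (not from the original header) tying the functions declared
    above together; 'handles' is assumed to be a std::vector<nanovdb::GridHandle<>>
    produced elsewhere, and Codec::BLOSC assumes NanoVDB was built with NANOVDB_USE_BLOSC:
    @code
    nanovdb::io::writeGrids("out.nvdb", handles, nanovdb::io::Codec::BLOSC, 1); // verbose = 1
    for (const auto& m : nanovdb::io::readGridMetaData("out.nvdb"))
        std::cout << m.gridName << " : " << m.voxelCount << " active voxels\n";
    auto all = nanovdb::io::readGrid("out.nvdb", -1); // n < 0 reads every grid into one handle
    @endcode
*/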
-struct Segment -{ - // Check assumptions made during read and write of FileHeader and FileMetaData - static_assert(sizeof(FileHeader) == 16u, "Unexpected sizeof(FileHeader)"); - FileHeader header;// defined in NanoVDB.h - std::vector meta;// defined in NanoVDB.h - Segment(Codec c = Codec::NONE) -#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS - : header{NANOVDB_MAGIC_FILE, Version(), 0u, c} -#else - : header{NANOVDB_MAGIC_NUMBER, Version(), 0u, c} -#endif - , meta() - { - } - template - void add(const GridHandle& h); - bool read(std::istream& is); - void write(std::ostream& os) const; - uint64_t memUsage() const; -}; // Segment - -/// @brief Return true if the file contains a grid with the specified name -bool hasGrid(const std::string& fileName, const std::string& gridName); - -/// @brief Return true if the stream contains a grid with the specified name -bool hasGrid(std::istream& is, const std::string& gridName); - -/// @brief Reads and returns a vector of meta data for all the grids found in the specified file -std::vector readGridMetaData(const std::string& fileName); - -/// @brief Reads and returns a vector of meta data for all the grids found in the specified stream -std::vector readGridMetaData(std::istream& is); - -// --------------------------> Implementations for Internal <------------------------------------ - -template -fileSize_t Internal::write(std::ostream& os, const GridHandle& handle, Codec codec, unsigned int n) -{ - const char* data = reinterpret_cast(handle.gridData(n)); - fileSize_t total = 0, residual = handle.gridSize(n); - - switch (codec) { - case Codec::ZIP: { -#ifdef NANOVDB_USE_ZIP - uLongf size = compressBound(residual); // Get an upper bound on the size of the compressed data. - std::unique_ptr tmp(new Bytef[size]); - const int status = compress(tmp.get(), &size, reinterpret_cast(data), residual); - if (status != Z_OK) - std::runtime_error("Internal write error in ZIP"); - if (size > residual) - std::cerr << "\nWarning: Unexpected ZIP compression from " << residual << " to " << size << " bytes\n"; - const fileSize_t outBytes = size; - os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); - os.write(reinterpret_cast(tmp.get()), outBytes); - total += sizeof(fileSize_t) + outBytes; -#else - throw std::runtime_error("ZIP compression codec was disabled during build"); -#endif - break; - } - case Codec::BLOSC: { -#ifdef NANOVDB_USE_BLOSC - do { - fileSize_t chunk = residual < MAX_SIZE ? 
residual : MAX_SIZE, size = chunk + BLOSC_MAX_OVERHEAD; - std::unique_ptr tmp(new char[size]); - const int count = blosc_compress_ctx(9, 1, sizeof(float), chunk, data, tmp.get(), size, BLOSC_LZ4_COMPNAME, 1 << 18, 1); - if (count <= 0) - std::runtime_error("Internal write error in BLOSC"); - const fileSize_t outBytes = count; - os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); - os.write(reinterpret_cast(tmp.get()), outBytes); - total += sizeof(fileSize_t) + outBytes; - data += chunk; - residual -= chunk; - } while (residual > 0); -#else - throw std::runtime_error("BLOSC compression codec was disabled during build"); -#endif - break; - } - default: - os.write(data, residual); - total += residual; - } - if (!os) throw std::runtime_error("Failed to write Tree to file"); - return total; -} // Internal::write - -template -void Internal::read(std::istream& is, BufferT& buffer, Codec codec) -{ - Internal::read(is, reinterpret_cast(buffer.data()), buffer.size(), codec); -} // Internal::read - -/// @brief read compressed grid from stream -/// @param is input stream to read from -/// @param data data buffer to write into -/// @param residual expected size of uncompressed data -/// @param codec mode of compression -void Internal::read(std::istream& is, char* data, fileSize_t residual, Codec codec) -{ - // read tree using optional compression - switch (codec) { - case Codec::ZIP: { -#ifdef NANOVDB_USE_ZIP - fileSize_t size; - is.read(reinterpret_cast(&size), sizeof(fileSize_t)); - std::unique_ptr tmp(new Bytef[size]); - is.read(reinterpret_cast(tmp.get()), size); - uLongf numBytes = residual; - int status = uncompress(reinterpret_cast(data), &numBytes, tmp.get(), static_cast(size)); - if (status != Z_OK) std::runtime_error("Internal read error in ZIP"); - if (fileSize_t(numBytes) != residual) throw std::runtime_error("UNZIP failed on byte size"); -#else - throw std::runtime_error("ZIP compression codec was disabled during build"); -#endif - break; - } - case Codec::BLOSC: { -#ifdef NANOVDB_USE_BLOSC - do { - fileSize_t size; - is.read(reinterpret_cast(&size), sizeof(fileSize_t)); - std::unique_ptr tmp(new char[size]); - is.read(reinterpret_cast(tmp.get()), size); - const fileSize_t chunk = residual < MAX_SIZE ? 
residual : MAX_SIZE; - const int count = blosc_decompress_ctx(tmp.get(), data, size_t(chunk), 1); //fails with more threads :( - if (count < 1) - std::runtime_error("Internal read error in BLOSC"); - if (count != int(chunk)) - throw std::runtime_error("BLOSC failed on byte size"); - data += size_t(chunk); - residual -= chunk; - } while (residual > 0); -#else - throw std::runtime_error("BLOSC compression codec was disabled during build"); -#endif - break; - } - default: - is.read(data, residual);// read uncompressed data - } - if (!is) throw std::runtime_error("Failed to read Tree from file"); -} // Internal::read - -// --------------------------> Implementations for FileGridMetaData <------------------------------------ - -template -inline FileGridMetaData::FileGridMetaData(uint64_t size, Codec c, const NanoGrid& grid) - : FileMetaData{size, // gridSize - size, // fileSize (will typically be redefined) - 0u, // nameKey - grid.activeVoxelCount(), // voxelCount - grid.gridType(), // gridType - grid.gridClass(), // gridClass - grid.worldBBox(), // worldBBox - grid.tree().bbox(), // indexBBox - grid.voxelSize(), // voxelSize - 0, // nameSize - {0, 0, 0, 1}, // nodeCount[4] - {0, 0, 0}, // tileCount[3] - c, // codec - 0, // padding - Version()}// version - , gridName(grid.gridName()) -{ - nameKey = stringHash(gridName); - nameSize = static_cast(gridName.size() + 1); // include '\0' - const uint32_t* ptr = reinterpret_cast(&grid.tree())->mNodeCount; - for (int i = 0; i < 3; ++i) FileMetaData::nodeCount[i] = *ptr++; - for (int i = 0; i < 3; ++i) FileMetaData::tileCount[i] = *ptr++; -}// FileGridMetaData::FileGridMetaData - -inline void FileGridMetaData::write(std::ostream& os) const -{ - os.write(reinterpret_cast(this), sizeof(FileMetaData)); - os.write(gridName.c_str(), nameSize); - if (!os) throw std::runtime_error("Failed writing FileGridMetaData"); -}// FileGridMetaData::write - -inline void FileGridMetaData::read(std::istream& is) -{ - is.read(reinterpret_cast(this), sizeof(FileMetaData)); - std::unique_ptr tmp(new char[nameSize]); - is.read(reinterpret_cast(tmp.get()), nameSize); - gridName.assign(tmp.get()); - if (!is) throw std::runtime_error("Failed reading FileGridMetaData"); -}// FileGridMetaData::read - -// --------------------------> Implementations for Segment <------------------------------------ - -inline uint64_t Segment::memUsage() const -{ - uint64_t sum = sizeof(FileHeader); - for (auto& m : meta) sum += m.memUsage();// includes FileMetaData + grid name - return sum; -}// Segment::memUsage - -template -inline void Segment::add(const GridHandle& h) -{ - for (uint32_t i = 0; i < h.gridCount(); ++i) { - if (auto* grid = h.template grid(i)) { // most common - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { 
- meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else if (auto* grid = h.template grid(i)) { - meta.emplace_back(h.gridSize(i), header.codec, *grid); - } else { - std::stringstream ss; - ss << "nanovdb::io::Segment::add: Cannot write grid of unknown type \""<(&header), sizeof(FileHeader))) { - throw std::runtime_error("Failed to write FileHeader of Segment"); - } - for (auto& m : meta) m.write(os); -}// Segment::write - -inline bool Segment::read(std::istream& is) -{ - is.read(reinterpret_cast(&header), sizeof(FileHeader)); - if (is.eof()) {// The EOF flag is only set once a read tries to read past the end of the file - is.clear(std::ios_base::eofbit);// clear eof flag so we can rewind and read again - return false; - } - if (!header.isValid()) { - // first check for byte-swapped header magic. 
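// A sketch of why the comparison below works (magic constant assumed from
// NanoVDB.h): NANOVDB_MAGIC_NUMBER, 0x304244566f6e614eULL, spells "NanoVDB0"
// when read as little-endian bytes, so a file written on an opposite-endian
// machine presents those eight bytes reversed, i.e.
// reverseEndianness(0x304244566f6e614eULL) == 0x4e616e6f56444230ULL.
// Matching the reversed magic lets read() report "reversed endianness" rather
// than the generic "not a valid nvdb file" error.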
- if (header.magic == reverseEndianness(NANOVDB_MAGIC_NUMBER) || - header.magic == reverseEndianness(NANOVDB_MAGIC_FILE)) { - throw std::runtime_error("This nvdb file has reversed endianness"); - } else { - throw std::runtime_error("Magic number error: This is not a valid nvdb file"); - } - } else if ( !header.version.isCompatible()) { - std::stringstream ss; - Version v; - is.read(reinterpret_cast(&v), sizeof(Version));// read GridData::mVersion located at byte 16=sizeof(FileHeader) is stream - if ( v.getMajor() == NANOVDB_MAJOR_VERSION_NUMBER) { - ss << "This file looks like it contains a raw grid buffer and not a standard file with meta data"; - } else if ( header.version.getMajor() < NANOVDB_MAJOR_VERSION_NUMBER) { - ss << "The file contains an older version of NanoVDB: " << std::string(header.version.c_str()) << "!\n\t" - << "Recommendation: Re-generate this NanoVDB file with this version: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X of NanoVDB"; - } else { - ss << "This tool was compiled against an older version of NanoVDB: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X!\n\t" - << "Recommendation: Re-compile this tool against the newer version: " << header.version.getMajor() << ".X of NanoVDB"; - } - throw std::runtime_error("An unrecoverable error in nanovdb::Segment::read:\n\tIncompatible file format: " + ss.str()); - } - meta.resize(header.gridCount); - for (auto& m : meta) { - m.read(is); - m.version = header.version; - } - return true; -}// Segment::read - -// --------------------------> writeGrid <------------------------------------ - -template -void writeGrid(std::ostream& os, const GridHandle& handle, Codec codec) -{ - Segment seg(codec); - seg.add(handle); - const auto start = os.tellp(); - seg.write(os); // write header without the correct fileSize (so it's allocated) - for (uint32_t i = 0; i < handle.gridCount(); ++i) { - seg.meta[i].fileSize = Internal::write(os, handle, codec, i); - } - os.seekp(start); - seg.write(os);// re-write header with the correct fileSize - os.seekp(0, std::ios_base::end);// skip to end -}// writeGrid - -template -void writeGrid(const std::string& fileName, const GridHandle& handle, Codec codec, int verbose) -{ - std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); - if (!os.is_open()) { - throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); - } - writeGrid(os, handle, codec); - if (verbose) { - std::cout << "Wrote nanovdb::Grid to file named \"" << fileName << "\"" << std::endl; - } -}// writeGrid - -// --------------------------> writeGrids <------------------------------------ - -template class VecT = std::vector> -void writeGrids(std::ostream& os, const VecT>& handles, Codec codec = Codec::NONE) -{ - for (auto& h : handles) writeGrid(os, h, codec); -}// writeGrids - -template class VecT> -void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec, int verbose) -{ - std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); - if (!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); - writeGrids(os, handles, codec); - if (verbose) std::cout << "Wrote " << handles.size() << " nanovdb::Grid(s) to file named \"" << fileName << "\"" << std::endl; -}// writeGrids - -// --------------------------> readGrid <------------------------------------ - -template -GridHandle readGrid(std::istream& is, int n, const BufferT& pool) -{ - GridHandle handle; - if (n<0) {// read all grids into the same buffer - try 
{//first try to read a raw grid buffer - handle.read(is, pool); - } catch(const std::logic_error&) { - Segment seg; - uint64_t bufferSize = 0u; - uint32_t gridCount = 0u, gridIndex = 0u; - const auto start = is.tellg(); - while (seg.read(is)) { - std::streamoff skipSize = 0; - for (auto& m : seg.meta) { - ++gridCount; - bufferSize += m.gridSize; - skipSize += m.fileSize; - }// loop over grids in segment - is.seekg(skipSize, std::ios_base::cur); // skip forward from the current position - }// loop over segments - auto buffer = BufferT::create(bufferSize, &pool); - char *ptr = (char*)buffer.data(); - is.seekg(start);// rewind - while (seg.read(is)) { - for (auto& m : seg.meta) { - Internal::read(is, ptr, m.gridSize, seg.header.codec); - updateGridCount((GridData*)ptr, gridIndex++, gridCount); - ptr += m.gridSize; - }// loop over grids in segment - }// loop over segments - return GridHandle(std::move(buffer)); - } - } else {// read a specific grid - try {//first try to read a raw grid buffer - handle.read(is, uint32_t(n), pool); - updateGridCount((GridData*)handle.data(), 0u, 1u); - } catch(const std::logic_error&) { - Segment seg; - int counter = -1; - while (seg.read(is)) { - std::streamoff seek = 0; - for (auto& m : seg.meta) { - if (++counter == n) { - auto buffer = BufferT::create(m.gridSize, &pool); - Internal::read(is, buffer, seg.header.codec); - updateGridCount((GridData*)buffer.data(), 0u, 1u); - return GridHandle(std::move(buffer)); - } else { - seek += m.fileSize; - } - }// loop over grids in segment - is.seekg(seek, std::ios_base::cur); // skip forward from the current position - }// loop over segments - if (n != counter) throw std::runtime_error("stream does not contain a #" + std::to_string(n) + " grid"); - } - } - return handle; -}// readGrid - -/// @brief Read the n'th grid -template -GridHandle readGrid(const std::string& fileName, int n, int verbose, const BufferT& buffer) -{ - std::ifstream is(fileName, std::ios::in | std::ios::binary); - if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); - auto handle = readGrid(is, n, buffer); - if (verbose) { - if (n<0) { - std::cout << "Read all NanoGrids from the file named \"" << fileName << "\"" << std::endl; - } else { - std::cout << "Read NanoGrid # " << n << " from the file named \"" << fileName << "\"" << std::endl; - } - } - return handle; // is converted to r-value and return value is move constructed. 
-}// readGrid - -/// @brief Read a specific grid from an input stream given the name of the grid -/// @tparam BufferT Buffer type used for allocation -/// @param is input stream from which to read the grid -/// @param gridName string name of the (first) grid to be returned -/// @param pool optional memory pool from which to allocate the grid buffer -/// @return Return the first grid in the input stream with a specific name -/// @throw std::runtime_error with no grid exists with the specified name -template -GridHandle readGrid(std::istream& is, const std::string& gridName, const BufferT& pool) -{ - try { - GridHandle handle; - handle.read(is, gridName, pool); - return handle; - } catch(const std::logic_error&) { - const auto key = stringHash(gridName); - Segment seg; - while (seg.read(is)) {// loop over all segments in stream - std::streamoff seek = 0; - for (auto& m : seg.meta) {// loop over all grids in segment - if ((m.nameKey == 0u || m.nameKey == key) && m.gridName == gridName) { // check for hash key collision - auto buffer = BufferT::create(m.gridSize, &pool); - is.seekg(seek, std::ios_base::cur); // rewind - Internal::read(is, buffer, seg.header.codec); - updateGridCount((GridData*)buffer.data(), 0u, 1u); - return GridHandle(std::move(buffer)); - } else { - seek += m.fileSize; - } - } - is.seekg(seek, std::ios_base::cur); // skip forward from the current position - } - } - throw std::runtime_error("Grid name '" + gridName + "' not found in file"); -}// readGrid - -/// @brief Read the first grid with a specific name -template -GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose, const BufferT& buffer) -{ - std::ifstream is(fileName, std::ios::in | std::ios::binary); - if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); - auto handle = readGrid(is, gridName, buffer); - if (verbose) { - if (handle) { - std::cout << "Read NanoGrid named \"" << gridName << "\" from the file named \"" << fileName << "\"" << std::endl; - } else { - std::cout << "File named \"" << fileName << "\" does not contain a grid named \"" + gridName + "\"" << std::endl; - } - } - return handle; // is converted to r-value and return value is move constructed. -}// readGrid - -// --------------------------> readGrids <------------------------------------ - -template class VecT = std::vector> -VecT> readGrids(std::istream& is, const BufferT& pool = BufferT()) -{ - VecT> handles; - Segment seg; - while (seg.read(is)) { - uint64_t bufferSize = 0; - for (auto& m : seg.meta) bufferSize += m.gridSize; - auto buffer = BufferT::create(bufferSize, &pool); - uint64_t bufferOffset = 0; - for (uint16_t i = 0; i < seg.header.gridCount; ++i) { - auto *data = reinterpret_cast(buffer.data() + bufferOffset); - Internal::read(is, (char*)data, seg.meta[i].gridSize, seg.header.codec); - updateGridCount(data, uint32_t(i), uint32_t(seg.header.gridCount)); - bufferOffset += seg.meta[i].gridSize; - }// loop over grids in segment - handles.emplace_back(std::move(buffer)); // force move copy assignment - }// loop over segments - return handles; // is converted to r-value and return value is move constructed. 
-}// readGrids - -/// @brief Read all the grids -template class VecT> -VecT> readGrids(const std::string& fileName, int verbose, const BufferT& buffer) -{ - std::ifstream is(fileName, std::ios::in | std::ios::binary); - if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); - auto handles = readGrids(is, buffer); - if (verbose) std::cout << "Read " << handles.size() << " NanoGrid(s) from the file named \"" << fileName << "\"" << std::endl; - return handles; // is converted to r-value and return value is move constructed. -}// readGrids - -// --------------------------> readGridMetaData <------------------------------------ - -inline std::vector readGridMetaData(const std::string& fileName) -{ - std::ifstream is(fileName, std::ios::in | std::ios::binary); - if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); - return readGridMetaData(is); // is converted to r-value and return value is move constructed. -}// readGridMetaData - -inline std::vector readGridMetaData(std::istream& is) -{ - Segment seg; - std::vector meta; - try { - GridHandle<> handle;// if stream contains a raw grid buffer we unfortunately have to load everything - handle.read(is); - seg.add(handle); - meta = std::move(seg.meta); - } catch(const std::logic_error&) { - while (seg.read(is)) { - std::streamoff skip = 0; - for (auto& m : seg.meta) { - meta.push_back(m); - skip += m.fileSize; - }// loop over grid meta data in segment - is.seekg(skip, std::ios_base::cur); - }// loop over segments - } - return meta; // is converted to r-value and return value is move constructed. -}// readGridMetaData - -// --------------------------> hasGrid <------------------------------------ - -inline bool hasGrid(const std::string& fileName, const std::string& gridName) -{ - std::ifstream is(fileName, std::ios::in | std::ios::binary); - if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); - return hasGrid(is, gridName); -}// hasGrid - -inline bool hasGrid(std::istream& is, const std::string& gridName) -{ - const auto key = stringHash(gridName); - Segment seg; - while (seg.read(is)) { - std::streamoff seek = 0; - for (auto& m : seg.meta) { - if (m.nameKey == key && m.gridName == gridName) return true; // check for hash key collision - seek += m.fileSize; - }// loop over grid meta data in segment - is.seekg(seek, std::ios_base::cur); - }// loop over segments - return false; -}// hasGrid - -// --------------------------> stringHash <------------------------------------ - -inline uint64_t stringHash(const char* c_str) -{ - uint64_t hash = 0;// zero is returned when cstr = nullptr or "\0" - if (c_str) { - for (auto* str = reinterpret_cast(c_str); *str; ++str) { - uint64_t overflow = hash >> (64 - 8); - hash *= 67; // Next-ish prime after 26 + 26 + 10 - hash += *str + overflow; - } - } - return hash; -}// stringHash - -} // namespace io - -template -inline std::ostream& -operator<<(std::ostream& os, const BBox>& b) -{ - os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " - << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; - return os; -} - -inline std::ostream& -operator<<(std::ostream& os, const CoordBBox& b) -{ - os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " - << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; - return os; -} - -inline std::ostream& -operator<<(std::ostream& os, const Coord& ijk) -{ - os << "(" << ijk[0] << 
"," << ijk[1] << "," << ijk[2] << ")"; - return os; -} - -template -inline std::ostream& -operator<<(std::ostream& os, const Vec3& v) -{ - os << "(" << v[0] << "," << v[1] << "," << v[2] << ")"; - return os; -} - -template -inline std::ostream& -operator<<(std::ostream& os, const Vec4& v) -{ - os << "(" << v[0] << "," << v[1] << "," << v[2] << "," << v[3] << ")"; - return os; -} - -} // namespace nanovdb - -#endif // NANOVDB_IO_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/io/IO.h instead.") diff --git a/nanovdb/nanovdb/util/Invoke.h b/nanovdb/nanovdb/util/Invoke.h index 48e1ac0a42..f0e1561bb4 100644 --- a/nanovdb/nanovdb/util/Invoke.h +++ b/nanovdb/nanovdb/util/Invoke.h @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 /*! - \file Invoke.h + \file nanovdb/util/Invoke.h \author Ken Museth @@ -16,8 +16,8 @@ @endcode */ -#ifndef NANOVDB_INVOKE_H_HAS_BEEN_INCLUDED -#define NANOVDB_INVOKE_H_HAS_BEEN_INCLUDED +#ifndef NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED #include // for nanovdb::CoordBBox @@ -31,6 +31,8 @@ namespace nanovdb { +namespace util { + namespace { #ifndef NANOVDB_USE_TBB // Base case @@ -82,6 +84,14 @@ int invoke(const Func &taskFunc1, Rest... taskFuncN) { return -1;// should never happen } +}// namespace util + +template +[[deprecated("Use nanovdb::util::invoke instead")]] +int invoke(const Func &taskFunc1, Rest... taskFuncN) { + return util::invoke(taskFunc1, taskFuncN...); +} + }// namespace nanovdb -#endif // NANOVDB_INVOKE_H_HAS_BEEN_INCLUDED +#endif // NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/util/NanoToOpenVDB.h b/nanovdb/nanovdb/util/NanoToOpenVDB.h index 8610afb9a8..ea7c956104 100644 --- a/nanovdb/nanovdb/util/NanoToOpenVDB.h +++ b/nanovdb/nanovdb/util/NanoToOpenVDB.h @@ -1,344 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file NanoToOpenVDB.h - - \author Ken Museth - - \date May 6, 2020 - - \brief This class will deserialize an NanoVDB grid into an OpenVDB grid. - - \todo Add support for PointIndexGrid and PointDataGrid -*/ - -#include // manages and streams the raw memory buffer of a NanoVDB grid. -#include -#include "ForEach.h" - -#include - -#ifndef NANOVDB_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED -#define NANOVDB_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED - -template -struct ConvertTrait {using Type = T;}; - -template -struct ConvertTrait> {using Type = openvdb::math::Vec3;}; - -template -struct ConvertTrait> {using Type = openvdb::math::Vec4;}; - -template<> -struct ConvertTrait {using Type = float;}; - -template<> -struct ConvertTrait {using Type = float;}; - -template<> -struct ConvertTrait {using Type = float;}; - -template<> -struct ConvertTrait {using Type = float;}; - -template<> -struct ConvertTrait {using Type = openvdb::ValueMask;}; - -namespace nanovdb { - -/// @brief Forward declaration of free-standing function that de-serializes a typed NanoVDB grid into an OpenVDB Grid -template -typename openvdb::Grid::Type>::Type>::Ptr -nanoToOpenVDB(const NanoGrid& grid, int verbose = 0); - -/// @brief Forward declaration of free-standing function that de-serializes a NanoVDB GridHandle into an OpenVDB GridBase -template -openvdb::GridBase::Ptr -nanoToOpenVDB(const GridHandle& handle, int verbose = 0, uint32_t n = 0); - -/// @brief This class will serialize an OpenVDB grid into a NanoVDB grid managed by a GridHandle. 
-template -class NanoToOpenVDB -{ - using NanoNode0 = LeafNode; // note that it's using openvdb coord nd mask types! - using NanoNode1 = InternalNode; - using NanoNode2 = InternalNode; - using NanoRootT = RootNode; - using NanoTreeT = Tree; - using NanoGridT = Grid; - using NanoValueT = typename NanoGridT::ValueType; - - using OpenBuildT = typename ConvertTrait::Type; // e.g. float -> float but nanovdb::Vec3 -> openvdb::Vec3 - using OpenNode0 = openvdb::tree::LeafNode; // leaf - using OpenNode1 = openvdb::tree::InternalNode; // lower - using OpenNode2 = openvdb::tree::InternalNode; // upper - using OpenRootT = openvdb::tree::RootNode; - using OpenTreeT = openvdb::tree::Tree; - using OpenGridT = openvdb::Grid; - using OpenValueT = typename OpenGridT::ValueType; - -public: - /// @brief Construction from an existing const OpenVDB Grid. - NanoToOpenVDB(){}; - - /// @brief Return a shared pointer to a NanoVDB grid constructed from the specified OpenVDB grid - typename OpenGridT::Ptr operator()(const NanoGrid& grid, int verbose = 0); - -private: - - template - OpenNodeT* processNode(const NanoNodeT*); - - OpenNode2* process(const NanoNode2* node) {return this->template processNode(node);} - OpenNode1* process(const NanoNode1* node) {return this->template processNode(node);} - - template - typename std::enable_if::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value, - OpenNode0*>::type - process(const NanoLeafT* node); - - template - typename std::enable_if::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - OpenNode0*>::type - process(const NanoLeafT* node); - - template - typename std::enable_if::value, - OpenNode0*>::type - process(const NanoLeafT* node); - - template - typename std::enable_if::value, - OpenNode0*>::type - process(const NanoLeafT* node); - - /// converts nanovdb value types to openvdb value types, e.g. 
nanovdb::Vec3f& -> openvdb::Vec3f& - static const OpenValueT& Convert(const NanoValueT &v) {return reinterpret_cast(v);} - static const OpenValueT* Convert(const NanoValueT *v) {return reinterpret_cast(v);} - -}; // NanoToOpenVDB class - -template -typename NanoToOpenVDB::OpenGridT::Ptr -NanoToOpenVDB::operator()(const NanoGrid& grid, int /*verbose*/) -{ - // since the input nanovdb grid might use nanovdb types (Coord, Mask, Vec3) we cast to use openvdb types - const NanoGridT *srcGrid = reinterpret_cast(&grid); - - auto dstGrid = openvdb::createGrid(Convert(srcGrid->tree().background())); - dstGrid->setName(srcGrid->gridName()); // set grid name - switch (srcGrid->gridClass()) { // set grid class - case nanovdb::GridClass::LevelSet: - dstGrid->setGridClass(openvdb::GRID_LEVEL_SET); - break; - case nanovdb::GridClass::FogVolume: - dstGrid->setGridClass(openvdb::GRID_FOG_VOLUME); - break; - case nanovdb::GridClass::Staggered: - dstGrid->setGridClass(openvdb::GRID_STAGGERED); - break; - case nanovdb::GridClass::PointIndex: - throw std::runtime_error("NanoToOpenVDB does not yet support PointIndexGrids"); - case nanovdb::GridClass::PointData: - throw std::runtime_error("NanoToOpenVDB does not yet support PointDataGrids"); - default: - dstGrid->setGridClass(openvdb::GRID_UNKNOWN); - } - // set transform - const nanovdb::Map& nanoMap = reinterpret_cast(srcGrid)->mMap; - auto mat = openvdb::math::Mat4::identity(); - mat.setMat3(openvdb::math::Mat3(nanoMap.mMatD)); - mat.transpose(); // the 3x3 in nanovdb is transposed relative to openvdb's 3x3 - mat.setTranslation(openvdb::math::Vec3(nanoMap.mVecD)); - dstGrid->setTransform(openvdb::math::Transform::createLinearTransform(mat)); // calls simplify! - - // process root node - auto &root = dstGrid->tree().root(); - auto *data = srcGrid->tree().root().data(); - for (uint32_t i=0; imTableSize; ++i) { - auto *tile = data->tile(i); - if (tile->isChild()) { - root.addChild( this->process( data->getChild(tile)) ); - } else { - root.addTile(tile->origin(), Convert(tile->value), tile->state); - } - } - - return dstGrid; -} - -template -template -DstNodeT* -NanoToOpenVDB::processNode(const SrcNodeT *srcNode) -{ - DstNodeT *dstNode = new DstNodeT(); // un-initialized for fast construction - dstNode->setOrigin(srcNode->origin()); - const auto& childMask = srcNode->childMask(); - const_cast(dstNode->getValueMask()) = srcNode->valueMask(); - const_cast(dstNode->getChildMask()) = childMask; - auto* dstTable = const_cast(dstNode->getTable()); - auto* srcData = srcNode->data(); - std::vector> childNodes; - const auto childCount = childMask.countOn(); - childNodes.reserve(childCount); - for (uint32_t n = 0; n < DstNodeT::NUM_VALUES; ++n) { - if (childMask.isOn(n)) { - childNodes.emplace_back(n, srcData->getChild(n)); - } else { - dstTable[n].setValue(Convert(srcData->mTable[n].value)); - } - } - auto kernel = [&](const auto& r) { - for (auto i = r.begin(); i != r.end(); ++i) { - auto &p = childNodes[i]; - dstTable[p.first].setChild( this->process(p.second) ); - } - }; - -#if 0 - kernel(Range1D(0, childCount)); -#else - forEach(0, childCount, 1, kernel); -#endif - return dstNode; -} // processNode - -template -template -inline typename std::enable_if::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value && - !std::is_same::value, - typename NanoToOpenVDB::OpenNode0*>::type -NanoToOpenVDB::process(const NanoLeafT *srcNode) -{ - static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); - 
OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction - dstNode->setOrigin(srcNode->origin()); - dstNode->setValueMask(srcNode->valueMask()); - - const auto* src = Convert(srcNode->data()->mValues);// doesn't work for compressed data, bool or ValueMask - for (auto *dst = dstNode->buffer().data(), *end = dst + OpenNode0::SIZE; dst != end; dst += 4, src += 4) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - } - - return dstNode; -} // process(NanoNode0) - -template -template -inline typename std::enable_if::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - typename NanoToOpenVDB::OpenNode0*>::type -NanoToOpenVDB::process(const NanoLeafT *srcNode) -{ - static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); - OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction - dstNode->setOrigin(srcNode->origin()); - dstNode->setValueMask(srcNode->valueMask()); - float *dst = dstNode->buffer().data(); - for (int i=0; i!=512; i+=4) { - *dst++ = srcNode->getValue(i); - *dst++ = srcNode->getValue(i+1); - *dst++ = srcNode->getValue(i+2); - *dst++ = srcNode->getValue(i+3); - } - - return dstNode; -} // process(NanoNode0) - -template -template -inline typename std::enable_if::value, - typename NanoToOpenVDB::OpenNode0*>::type -NanoToOpenVDB::process(const NanoLeafT *srcNode) -{ - static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); - OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction - dstNode->setOrigin(srcNode->origin()); - dstNode->setValueMask(srcNode->valueMask()); - - return dstNode; -} // process(NanoNode0) - -template -template -inline typename std::enable_if::value, - typename NanoToOpenVDB::OpenNode0*>::type -NanoToOpenVDB::process(const NanoLeafT *srcNode) -{ - static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); - OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction - dstNode->setOrigin(srcNode->origin()); - dstNode->setValueMask(srcNode->valueMask()); - reinterpret_cast&>(dstNode->buffer()) = srcNode->data()->mValues; - - return dstNode; -} // process(NanoNode0) - -template -inline typename openvdb::Grid::Type>::Type>::Ptr -nanoToOpenVDB(const NanoGrid& grid, int verbose) -{ - nanovdb::NanoToOpenVDB tmp; - return tmp(grid, verbose); -} - -template -openvdb::GridBase::Ptr -nanoToOpenVDB(const GridHandle& handle, int verbose, uint32_t n) -{ - if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) 
{ - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else if (auto grid = handle.template grid(n)) { - return nanovdb::nanoToOpenVDB(*grid, verbose); - } else { - OPENVDB_THROW(openvdb::RuntimeError, "Unsupported NanoVDB grid type!"); - } -} - -} // namespace nanovdb - -#endif // NANOVDB_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/NanoToOpenVDB.h instead.") diff --git a/nanovdb/nanovdb/util/NodeManager.h b/nanovdb/nanovdb/util/NodeManager.h index 4da1eee873..5f665ee7f6 100644 --- a/nanovdb/nanovdb/util/NodeManager.h +++ b/nanovdb/nanovdb/util/NodeManager.h @@ -1,327 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file NodeManager.h - - \author Ken Museth - - \date February 12, 2021 - - \brief This class allows for sequential access to nodes - in a NanoVDB tree on both the host and device. - - \details The ordering of the sequential access to nodes is always breadth-first! -*/ - -#include // for NanoGrid etc -#include "HostBuffer.h"// for HostBuffer - -#ifndef NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED -#define NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED - -namespace nanovdb { - -/// @brief NodeManager allows for sequential access to nodes -template -class NodeManager; - -/// @brief NodeManagerHandle manages the memory of a NodeManager -template -class NodeManagerHandle; - -/// @brief brief Construct a NodeManager and return its handle -/// -/// @param grid grid whose nodes will be accessed sequentially -/// @param buffer buffer from which to allocate the output handle -/// -/// @note This is the only way to create a NodeManager since it's using -/// managed memory pointed to by a NodeManagerHandle. -template -NodeManagerHandle createNodeManager(const NanoGrid &grid, - const BufferT& buffer = BufferT()); - -struct NodeManagerData -{// 48B = 6*8B - uint64_t mMagic;// 8B - union {int64_t mPadding; uint8_t mLinear;};// 8B of which 1B is used for a binary flag - void *mGrid;// 8B pointer to either host or device grid - union {int64_t *mPtr[3], mOff[3];};// 24B, use mOff if mLinear!=0 -}; - -/// @brief This class serves to manage a raw memory buffer of a NanoVDB NodeManager or LeafManager. -template -class NodeManagerHandle -{ - GridType mGridType{GridType::Unknown}; - BufferT mBuffer; - - template - const NodeManager* getMgr() const { - return mGridType == mapToGridType() ? (const NodeManager*)mBuffer.data() : nullptr; - } - - template - typename enable_if::hasDeviceDual, const NodeManager*>::type - getDeviceMgr() const { - return mGridType == mapToGridType() ? 
(const NodeManager*)mBuffer.deviceData() : nullptr; - } - - template - static T* no_const(const T* ptr) { return const_cast(ptr); } - -public: - /// @brief Move constructor from a buffer - NodeManagerHandle(GridType gridType, BufferT&& buffer) : mGridType(gridType) { mBuffer = std::move(buffer); } - /// @brief Empty ctor - NodeManagerHandle() = default; - /// @brief Disallow copy-construction - NodeManagerHandle(const NodeManagerHandle&) = delete; - /// @brief Disallow copy assignment operation - NodeManagerHandle& operator=(const NodeManagerHandle&) = delete; - /// @brief Move copy assignment operation - NodeManagerHandle& operator=(NodeManagerHandle&& other) noexcept { - mGridType = other.mGridType; - mBuffer = std::move(other.mBuffer); - other.mGridType = GridType::Unknown; - return *this; - } - /// @brief Move copy-constructor - NodeManagerHandle(NodeManagerHandle&& other) noexcept { - mGridType = other.mGridType; - mBuffer = std::move(other.mBuffer); - other.mGridType = GridType::Unknown; - } - /// @brief Default destructor - ~NodeManagerHandle() { this->reset(); } - /// @brief clear the buffer - void reset() { mBuffer.clear(); } - - /// @brief Return a reference to the buffer - BufferT& buffer() { return mBuffer; } - - /// @brief Return a const reference to the buffer - const BufferT& buffer() const { return mBuffer; } - - /// @brief Returns a non-const pointer to the data. - /// - /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized - uint8_t* data() { return mBuffer.data(); } - - /// @brief Returns a const pointer to the data. - /// - /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized - const uint8_t* data() const { return mBuffer.data(); } - - /// @brief Returns the size in bytes of the raw memory buffer managed by this NodeManagerHandle's allocator. - uint64_t size() const { return mBuffer.size(); } - - /// @brief Returns a const pointer to the NodeManager encoded in this NodeManagerHandle. - /// - /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! - template - const NodeManager* mgr() const { return this->template getMgr(); } - - /// @brief Returns a pointer to the NodeManager encoded in this NodeManagerHandle. - /// - /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! - template - NodeManager* mgr() { return no_const(this->template getMgr()); } - - /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU - /// - /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! - template - typename enable_if::hasDeviceDual, const NodeManager*>::type - deviceMgr() const { return this->template getDeviceMgr(); } - - /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU - /// - /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! - template - typename enable_if::hasDeviceDual, NodeManager*>::type - deviceMgr() { return no_const(this->template getDeviceMgr()); } - - /// @brief Upload the NodeManager to the device, e.g. 
from CPU to GPU - /// - /// @note This method is only available if the buffer supports devices - template - typename enable_if::hasDeviceDual, void>::type - deviceUpload(void* deviceGrid, void* stream = nullptr, bool sync = true) - { - assert(deviceGrid); - auto *data = reinterpret_cast(mBuffer.data()); - void *tmp = data->mGrid; - data->mGrid = deviceGrid; - mBuffer.deviceUpload(stream, sync); - data->mGrid = tmp; - } - - /// @brief Download the NodeManager to from the device, e.g. from GPU to CPU - /// - /// @note This method is only available if the buffer supports devices - template - typename enable_if::hasDeviceDual, void>::type - deviceDownload(void* stream = nullptr, bool sync = true) - { - auto *data = reinterpret_cast(mBuffer.data()); - void *tmp = data->mGrid; - mBuffer.deviceDownload(stream, sync); - data->mGrid = tmp; - } -};// NodeManagerHandle - -/// @brief This class allows for sequential access to nodes in a NanoVDB tree -/// -/// @details Nodes are always arranged breadth first during sequential access of nodes -/// at a particular level. -template -class NodeManager : private NodeManagerData -{ - using DataT = NodeManagerData; - using GridT = NanoGrid; - using TreeT = typename GridTree::type; - template - using NodeT = typename NodeTrait::type; - using RootT = NodeT<3>;// root node - using Node2 = NodeT<2>;// upper internal node - using Node1 = NodeT<1>;// lower internal node - using Node0 = NodeT<0>;// leaf node - -public: - static constexpr bool FIXED_SIZE = Node0::FIXED_SIZE && Node1::FIXED_SIZE && Node2::FIXED_SIZE; - - NodeManager(const NodeManager&) = delete; - NodeManager(NodeManager&&) = delete; - NodeManager& operator=(const NodeManager&) = delete; - NodeManager& operator=(NodeManager&&) = delete; - ~NodeManager() = delete; - - /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory. - /// This allows for direct and memory-efficient linear access to nodes. - __hostdev__ static bool isLinear(const GridT &grid) {return FIXED_SIZE && grid.isBreadthFirst();} - - /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory. - /// This allows for direct and memory-efficient linear access to nodes. 
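
/// @par Example (sketch)
/// A minimal host-side sketch of the NodeManager API above. It assumes
/// `gridHandle` is an existing GridHandle holding a float grid; the variable
/// names are illustrative and not part of this header:
/// @code
/// auto mgrHandle = nanovdb::createNodeManager(*gridHandle.grid<float>());
/// if (const auto* mgr = mgrHandle.mgr<float>()) {
///     for (uint32_t i = 0, n = uint32_t(mgr->leafCount()); i < n; ++i) {
///         const nanovdb::Coord ijk = mgr->leaf(i).origin(); // leaves are ordered breadth-first
///     }
/// }
/// @endcode
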
- __hostdev__ bool isLinear() const {return DataT::mLinear!=0u;} - - /// @brief Return the memory footprint in bytes of the NodeManager derived from the specified grid - __hostdev__ static uint64_t memUsage(const GridT &grid) { - uint64_t size = sizeof(NodeManagerData); - if (!NodeManager::isLinear(grid)) { - const uint32_t *p = grid.tree().mNodeCount; - size += sizeof(int64_t)*(p[0]+p[1]+p[2]); - } - return size; - } - - /// @brief Return the memory footprint in bytes of this instance - __hostdev__ uint64_t memUsage() const {return NodeManager::memUsage(this->grid());} - - /// @brief Return a reference to the grid - __hostdev__ GridT& grid() { return *reinterpret_cast(DataT::mGrid); } - __hostdev__ const GridT& grid() const { return *reinterpret_cast(DataT::mGrid); } - - /// @brief Return a reference to the tree - __hostdev__ TreeT& tree() { return this->grid().tree(); } - __hostdev__ const TreeT& tree() const { return this->grid().tree(); } - - /// @brief Return a reference to the root - __hostdev__ RootT& root() { return this->tree().root(); } - __hostdev__ const RootT& root() const { return this->tree().root(); } - - /// @brief Return the number of tree nodes at the specified level - /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level - __hostdev__ uint64_t nodeCount(int level) const { return this->tree().nodeCount(level); } - - __hostdev__ uint64_t leafCount() const { return this->tree().nodeCount(0); } - __hostdev__ uint64_t lowerCount() const { return this->tree().nodeCount(1); } - __hostdev__ uint64_t upperCount() const { return this->tree().nodeCount(2); } - - /// @brief Return the i'th leaf node with respect to breadth-first ordering - template - __hostdev__ const NodeT& node(uint32_t i) const { - NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); - const NodeT* ptr = nullptr; - if (DataT::mLinear) { - ptr = PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; - } else { - ptr = PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); - } - NANOVDB_ASSERT(isValid(ptr)); - return *ptr; - } - - /// @brief Return the i'th node with respect to breadth-first ordering - template - __hostdev__ NodeT& node(uint32_t i) { - NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); - NodeT* ptr = nullptr; - if (DataT::mLinear) { - ptr = PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; - } else { - ptr = PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); - } - NANOVDB_ASSERT(isValid(ptr)); - return *ptr; - } - - /// @brief Return the i'th leaf node with respect to breadth-first ordering - __hostdev__ const Node0& leaf(uint32_t i) const { return this->node<0>(i); } - __hostdev__ Node0& leaf(uint32_t i) { return this->node<0>(i); } - - /// @brief Return the i'th lower internal node with respect to breadth-first ordering - __hostdev__ const Node1& lower(uint32_t i) const { return this->node<1>(i); } - __hostdev__ Node1& lower(uint32_t i) { return this->node<1>(i); } - - /// @brief Return the i'th upper internal node with respect to breadth-first ordering - __hostdev__ const Node2& upper(uint32_t i) const { return this->node<2>(i); } - __hostdev__ Node2& upper(uint32_t i) { return this->node<2>(i); } - -}; // NodeManager class - -template -NodeManagerHandle createNodeManager(const NanoGrid &grid, - const BufferT& buffer) -{ - NodeManagerHandle handle(mapToGridType(), BufferT::create(NodeManager::memUsage(grid), &buffer)); - auto *data = reinterpret_cast(handle.data()); - NANOVDB_ASSERT(isValid(data)); - NANOVDB_ASSERT(mapToGridType() == grid.gridType()); -#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS - *data = 
NodeManagerData{NANOVDB_MAGIC_NODE,   0u, (void*)&grid, {0u,0u,0u}};
-#else
-    *data = NodeManagerData{NANOVDB_MAGIC_NUMBER, 0u, (void*)&grid, {0u,0u,0u}};
-#endif
-
-    if (NodeManager<BuildT>::isLinear(grid)) {
-        data->mLinear = uint8_t(1u);
-        data->mOff[0] = PtrDiff(grid.tree().template getFirstNode<0>(), &grid);
-        data->mOff[1] = PtrDiff(grid.tree().template getFirstNode<1>(), &grid);
-        data->mOff[2] = PtrDiff(grid.tree().template getFirstNode<2>(), &grid);
-    } else {
-        int64_t *ptr0 = data->mPtr[0] = reinterpret_cast<int64_t*>(data + 1);
-        int64_t *ptr1 = data->mPtr[1] = data->mPtr[0] + grid.tree().nodeCount(0);
-        int64_t *ptr2 = data->mPtr[2] = data->mPtr[1] + grid.tree().nodeCount(1);
-        // Performs depth-first traversal but breadth-first insertion
-        for (auto it2 = grid.tree().root().cbeginChild(); it2; ++it2) {
-            *ptr2++ = PtrDiff(&*it2, &grid);
-            for (auto it1 = it2->cbeginChild(); it1; ++it1) {
-                *ptr1++ = PtrDiff(&*it1, &grid);
-                for (auto it0 = it1->cbeginChild(); it0; ++it0) {
-                    *ptr0++ = PtrDiff(&*it0, &grid);
-                }// loop over child nodes of the lower internal node
-            }// loop over child nodes of the upper internal node
-        }// loop over child nodes of the root node
-    }
-
-    return handle;// handle is converted to an r-value, so the return value is move constructed!
-}
-
-} // namespace nanovdb
-
-#if defined(__CUDACC__)
-#include
-#endif// defined(__CUDACC__)
-
-#endif // NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/NodeManager.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/NodeManager.h instead.")
diff --git a/nanovdb/nanovdb/util/OpenToNanoVDB.h b/nanovdb/nanovdb/util/OpenToNanoVDB.h
index ea6c2c94d7..a4cecde1de 100644
--- a/nanovdb/nanovdb/util/OpenToNanoVDB.h
+++ b/nanovdb/nanovdb/util/OpenToNanoVDB.h
@@ -1,15 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
 
-/*!
-    \file OpenToNanoVDB.h
-
-    \author Ken Museth
-
-    \date January 8, 2020
-
-    \warning this file has been replaced by CreateNanoGrid.h
-
-*/
-
-#include "CreateNanoGrid.h"
\ No newline at end of file
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/CreateNanoGrid.h>
+NANOVDB_DEPRECATED_HEADER("Use nanovdb/tools/CreateNanoGrid.h instead.")
\ No newline at end of file
diff --git a/nanovdb/nanovdb/util/PrefixSum.h b/nanovdb/nanovdb/util/PrefixSum.h
index 87775c2d2a..0f70a81a10 100644
--- a/nanovdb/nanovdb/util/PrefixSum.h
+++ b/nanovdb/nanovdb/util/PrefixSum.h
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: MPL-2.0
 
 /*!
-    \file PrefixSum.h
+    \file nanovdb/util/PrefixSum.h
 
     \author Ken Museth
 
@@ -15,10 +15,10 @@
     last entry which is the sum of all the input elements.
 */
 
-#ifndef NANOVDB_PREFIX_SUM_H_HAS_BEEN_INCLUDED
-#define NANOVDB_PREFIX_SUM_H_HAS_BEEN_INCLUDED
+#ifndef NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED
 
-#include "Range.h"// for Range1D
+#include <nanovdb/util/Range.h>// for Range1D
 #include <vector>
 #include <functional>// for std::plus
 
@@ -28,6 +28,8 @@
 namespace nanovdb {
 
+namespace util {
+
 /// @brief Computes inclusive prefix sum of a vector
 /// @tparam T Type of the elements in the input/output vector
 /// @tparam OpT Type of operation performed on each element (defaults to sum)
@@ -74,6 +76,15 @@ T prefixSum(std::vector<T> &vec, bool threaded, OpT op)
     return vec.back();// sum of all input elements
 }// prefixSum
 
+}// namespace util
+
+template<typename T, typename OpT = std::plus<T>>
+[[deprecated("Use nanovdb::util::prefixSum instead")]]
+T prefixSum(std::vector<T> &vec, bool threaded = true, OpT op = OpT())
+{
+    return util::prefixSum(vec, threaded, op);
+}// prefixSum
+
 }// namespace nanovdb
 
-#endif // NANOVDB_PREFIX_SUM_H_HAS_BEEN_INCLUDED
+#endif // NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/util/Primitives.h b/nanovdb/nanovdb/util/Primitives.h
index 7c1f3a5856..0d6714e0d2 100644
--- a/nanovdb/nanovdb/util/Primitives.h
+++ b/nanovdb/nanovdb/util/Primitives.h
@@ -1,1754 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
 
-/*!
-    \file Primitives.h
-
-    \author Ken Museth
-
-    \date June 26, 2020
-
-    \brief Generates volumetric primitives, e.g. sphere, torus etc, as NanoVDB grids.
-
-    \note This has no dependency on openvdb.
-*/
-
-#ifndef NANOVDB_PRIMITIVES_H_HAS_BEEN_INCLUDED
-#define NANOVDB_PRIMITIVES_H_HAS_BEEN_INCLUDED
-
-#define NANOVDB_PARALLEL_PRIMITIVES
-
-#include
-#include "CreateNanoGrid.h"
-#include
-
-namespace nanovdb {
-
-/// @brief Returns a handle to a narrow-band level set of a sphere
-///
-/// @param radius    Radius of sphere in world units
-/// @param center    Center of sphere in world units
-/// @param voxelSize Size of a voxel in world units
-/// @param halfWidth Half-width of narrow band in voxel units
-/// @param origin    Origin of grid in world units
-/// @param name      Name of the grid
-/// @param sMode     Mode of computation for the statistics.
-/// @param cMode     Mode of computation for the checksum.
-/// @param tolerance Global error tolerance used when VoxelT = FpN
-/// @param ditherOn  If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN}
-/// @param buffer    Buffer used for memory allocation by the handle
-///
-/// @details The @c BuildT template parameter must be one of the following:
-///          float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance
-///          argument is only used when BuildT is set to FpN.
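
/// @par Example (sketch)
/// A minimal usage sketch for the level-set factory declared below; the
/// radius, voxel size and half-width are arbitrary example values:
/// @code
/// auto handle = nanovdb::createLevelSetSphere<float>(100.0, nanovdb::Vec3d(0), 1.0, 3.0);
/// const nanovdb::NanoGrid<float>* grid = handle.grid<float>(); // non-null since BuildT = float
/// const bool isLS = grid->isLevelSet(); // true: the zero crossing marks the sphere surface
/// @endcode
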
-template -typename enable_if::value || - is_same::value, GridHandle>::type -createLevelSetSphere(double radius = 100.0, - const Vec3d& center = Vec3d(0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0), - const std::string& name = "sphere_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value || - is_same::value || - is_same::value, GridHandle>::type -createLevelSetSphere(double radius = 100.0, - const Vec3d& center = Vec3d(0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0), - const std::string& name = "sphere_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createLevelSetSphere(double radius = 100.0, - const Vec3d& center = Vec3d(0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0), - const std::string& name = "sphere_ls_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a sparse fog volume of a sphere such -/// that the exterior is 0 and inactive, the interior is active -/// with values varying smoothly from 0 at the surface of the -/// sphere to 1 at the halfWidth and interior of the sphere. -/// -/// @param radius Radius of sphere in world units -/// @param center Center of sphere in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. 
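
/// @par Example (sketch)
/// A sketch of the compressed FpN variant declared below. The tolerance is
/// handed to an AbsDiff oracle (see the implementations further down in this
/// file), so it acts as an absolute error bound on the stored values:
/// @code
/// auto handle = nanovdb::createFogVolumeSphere<nanovdb::FpN>(
///     100.0, nanovdb::Vec3d(0.0), 1.0, 3.0, nanovdb::Vec3d(0.0), "sphere_fog",
///     nanovdb::StatsMode::Default, nanovdb::ChecksumMode::Default,
///     0.01f /*tolerance*/, true /*ditherOn*/);
/// @endcode
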
-template -typename disable_if::value, GridHandle>::type -createFogVolumeSphere(double radius = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "sphere_fog", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createFogVolumeSphere(double radius = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "sphere_fog", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a PointDataGrid containing points scattered -/// on the surface of a sphere. -/// -/// @param pointsPerVoxel Number of point per voxel on on the surface -/// @param radius Radius of sphere in world units -/// @param center Center of sphere in world units -/// @param voxelSize Size of a voxel in world units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param mode Mode of computation for the checksum. -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be float (default) or double. -template -typename disable_if::value, GridHandle>::type -createPointSphere(int pointsPerVoxel = 1, - double radius = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "sphere_points", - ChecksumMode mode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a narrow-band level set of a torus in the xz-plane -/// -/// @param majorRadius Major radius of torus in world units -/// @param minorRadius Minor radius of torus in world units -/// @param center Center of torus in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. 
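
/// @par Example (sketch)
/// A sketch that writes the generated torus to a .nvdb file; io::writeGrid is
/// assumed to come from nanovdb/util/IO.h:
/// @code
/// #include <nanovdb/util/IO.h>
///
/// auto handle = nanovdb::createLevelSetTorus<float>(100.0, 50.0);
/// nanovdb::io::writeGrid("torus.nvdb", handle); // uncompressed codec by default
/// @endcode
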
-template -typename disable_if::value, GridHandle>::type -createLevelSetTorus(double majorRadius = 100.0, - double minorRadius = 50.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "torus_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createLevelSetTorus(double majorRadius = 100.0, - double minorRadius = 50.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "torus_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a sparse fog volume of a torus in the xz-plane such -/// that the exterior is 0 and inactive, the interior is active -/// with values varying smoothly from 0 at the surface of the -/// torus to 1 at the halfWidth and interior of the torus. -/// -/// @param majorRadius Major radius of torus in world units -/// @param minorRadius Minor radius of torus in world units -/// @param center Center of torus in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. -template -typename disable_if::value, GridHandle>::type -createFogVolumeTorus(double majorRadius = 100.0, - double minorRadius = 50.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "torus_fog", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createFogVolumeTorus(double majorRadius = 100.0, - double minorRadius = 50.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "torus_fog_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a PointDataGrid containing points scattered -/// on the surface of a torus. 
-/// -/// @param pointsPerVoxel Number of point per voxel on on the surface -/// @param majorRadius Major radius of torus in world units -/// @param minorRadius Minor radius of torus in world units -/// @param center Center of torus in world units -/// @param voxelSize Size of a voxel in world units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param cMode Mode of computation for the checksum. -/// @param buffer Buffer used for memory allocation by the handle -// -/// @details The @c BuildT template parameter must be float (default) or double. -template -typename disable_if::value, GridHandle>::type -createPointTorus(int pointsPerVoxel = 1, // half-width of narrow band in voxel units - double majorRadius = 100.0, // major radius of torus in world units - double minorRadius = 50.0, // minor radius of torus in world units - const Vec3d& center = Vec3d(0.0), // center of torus in world units - double voxelSize = 1.0, // size of a voxel in world units - const Vec3d& origin = Vec3d(0.0f), // origin of grid in world units - const std::string& name = "torus_points", // name of grid - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a narrow-band level set of a box -/// -/// @param width Width of box in world units -/// @param height Height of box in world units -/// @param depth Depth of box in world units -/// @param center Center of box in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. 
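
/// @par Example (sketch)
/// A sketch that samples the signed distances produced by the box factory
/// declared below, using the read accessor from NanoVDB.h:
/// @code
/// auto handle = nanovdb::createLevelSetBox<float>(40.0, 60.0, 100.0);
/// auto acc = handle.grid<float>()->getAccessor();
/// const float d = acc.getValue(nanovdb::Coord(20, 0, 0)); // ~0: on the box surface
/// @endcode
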
-template -typename disable_if::value, GridHandle>::type -createLevelSetBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "box_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createLevelSetBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "box_ls_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a sparse fog volume of a box such -/// that the exterior is 0 and inactive, the interior is active -/// with values varying smoothly from 0 at the surface of the -/// box to 1 at the halfWidth and interior of the box. -/// -/// @param width Width of box in world units -/// @param height Height of box in world units -/// @param depth Depth of box in world units -/// @param center Center of box in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. 
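
/// @par Example (sketch)
/// Fog volumes store densities rather than signed distances: the exterior is
/// 0 and inactive, and values ramp smoothly up to 1 in the interior. A sketch:
/// @code
/// auto handle = nanovdb::createFogVolumeBox<float>(40.0, 60.0, 100.0);
/// const bool isFog = handle.grid<float>()->isFogVolume(); // true
/// @endcode
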
-template -typename disable_if::value, GridHandle>::type -createFogVolumeBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "box_fog", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createFogVolumeBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "box_fog_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a narrow-band level set of a octahedron -/// -/// @param scale Scale of octahedron in world units -/// @param center Center of octahedron in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. -template -typename disable_if::value, GridHandle>::type -createLevelSetOctahedron(double scale = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "octadedron_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createLevelSetOctahedron(double scale = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "octadedron_ls_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a sparse fog volume of an octahedron such -/// that the exterior is 0 and inactive, the interior is active -/// with values varying smoothly from 0 at the surface of the -/// octahedron to 1 at the halfWidth and interior of the octahedron. -/// -/// @param scale Scale of octahedron in world units -/// @param center Center of box in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. 
-/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. -template -typename disable_if::value, GridHandle>::type -createFogVolumeOctahedron(double scale = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "octadedron_fog", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createFogVolumeOctahedron(double scale = 100.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "octadedron_fog_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Returns a handle to a narrow-band level set of a bounding-box (= wireframe of a box) -/// -/// @param width Width of box in world units -/// @param height Height of box in world units -/// @param depth Depth of box in world units -/// @param thickness Thickness of the wire in world units -/// @param center Center of bbox in world units -/// @param voxelSize Size of a voxel in world units -/// @param halfWidth Half-width of narrow band in voxel units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param sMode Mode of computation for the statistics. -/// @param cMode Mode of computation for the checksum. -/// @param tolerance Global error tolerance use when VoxelT = FpN -/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} -/// @param buffer Buffer used for memory allocation by the handle -/// -/// @details The @c BuildT template parameter must be one of the following: -/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance -/// argument is only used when BuildT is set to FpN. 
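
/// @par Example (sketch)
/// A sketch of the wireframe variant declared below; the fourth argument is
/// the thickness of the wire in world units:
/// @code
/// auto handle = nanovdb::createLevelSetBBox<float>(40.0, 60.0, 100.0, 10.0);
/// const bool isLS = handle.grid<float>()->isLevelSet(); // true
/// @endcode
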
-template -typename disable_if::value, GridHandle>::type -createLevelSetBBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - double thickness = 10.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "bbox_ls", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -template -typename enable_if::value, GridHandle>::type -createLevelSetBBox(double width = 40.0, - double height = 60.0, - double depth = 100.0, - double thickness = 10.0, - const Vec3d& center = Vec3d(0.0), - double voxelSize = 1.0, - double halfWidth = 3.0, - const Vec3d& origin = Vec3d(0.0), - const std::string& name = "bbox_ls_FpN", - StatsMode sMode = StatsMode::Default, - ChecksumMode cMode = ChecksumMode::Default, - float tolerance = -1.0f, - bool ditherOn = false, - const BufferT& buffer = BufferT()); - - -//================================================================================================ - -/// @brief Returns a handle to a PointDataGrid containing points scattered -/// on the surface of a box. -/// -/// @param pointsPerVoxel Number of point per voxel on on the surface -/// @param width Width of box in world units -/// @param height Height of box in world units -/// @param depth Depth of box in world units -/// @param center Center of box in world units -/// @param voxelSize Size of a voxel in world units -/// @param origin Origin of grid in world units -/// @param name Name of the grid -/// @param mode Mode of computation for the checksum. -/// @param buffer Buffer used for memory allocation by the handle -template -typename disable_if::value, GridHandle>::type -createPointBox(int pointsPerVoxel = 1, // half-width of narrow band in voxel units - double width = 40.0, // width of box in world units - double height = 60.0, // height of box in world units - double depth = 100.0, // depth of box in world units - const Vec3d& center = Vec3d(0.0), // center of box in world units - double voxelSize = 1.0, // size of a voxel in world units - const Vec3d& origin = Vec3d(0.0), // origin of grid in world units - const std::string& name = "box_points", // name of grid - ChecksumMode mode = ChecksumMode::Default, - const BufferT& buffer = BufferT()); - -//================================================================================================ - -/// @brief Given an input NanoVDB voxel grid this methods returns a GridHandle to another NanoVDB -/// PointDataGrid with points scattered in the active leaf voxels of in input grid. Note, the -/// coordinates of the points are encoded as blind data in world-space. -/// -/// @param srcGrid Const input grid used to determine the active voxels to scatter points into -/// @param pointsPerVoxel Number of point per voxel on on the surface -/// @param name Name of the grid -/// @param mode Mode of computation for the checksum. 
-/// @param buffer          Buffer used for memory allocation by the handle
-template
-inline GridHandle
-createPointScatter(const NanoGrid& srcGrid, // source grid used to scatter points into
-                   int pointsPerVoxel = 1, // number of points per voxel
-                   const std::string& name = "point_scatter", // name of grid
-                   ChecksumMode mode = ChecksumMode::Default,
-                   const BufferT& buffer = BufferT());
-
-//================================================================================================
-
-namespace {
-
-/// @brief Returns a shared pointer to a build::Grid containing narrow-band SDF values for a sphere
-///
-/// @note This is not (yet) a valid level set SDF field since values inside the sphere (and outside
-///       the narrow band) are still undefined. Call build::sdfToLevelSet() to set those
-///       values or alternatively call build::levelSetToFog to generate a FOG volume.
-///
-/// @details The @c BuildT template parameter must be one of the following:
-///          float (default), double, Fp4, Fp8, Fp16 or FpN.
-template<typename BuildT>
-std::shared_ptr<build::Grid<BuildT>>
-initSphere(double radius, // radius of sphere in world units
-           const Vec3d& center, // center of sphere in world units
-           double voxelSize, // size of a voxel in world units
-           double halfWidth, // half-width of narrow band in voxel units
-           const Vec3d& origin) // origin of grid in world units
-{
-    using GridT  = build::Grid<BuildT>;
-    using ValueT = typename BuildToValueMap<BuildT>::type;
-    static_assert(is_floating_point<ValueT>::value, "initSphere: expect floating point");
-    if (!(radius > 0))
-        throw std::runtime_error("Sphere: radius must be positive!");
-    if (!(voxelSize > 0))
-        throw std::runtime_error("Sphere: voxelSize must be positive!");
-    if (!(halfWidth > 0))
-        throw std::runtime_error("Sphere: halfWidth must be positive!");
-
-    auto grid = std::make_shared<GridT>(ValueT(halfWidth * voxelSize));
-    grid->setTransform(voxelSize, origin);
-
-    // Define radius of sphere with narrow-band in voxel units
-    const ValueT r0 = radius / ValueT(voxelSize), rmax = r0 + ValueT(halfWidth);
-
-    // Radius below the Nyquist frequency
-    if (r0 < ValueT(1.5f)) return grid;
-
-    // Define center of sphere in voxel units
-    const Vec3<ValueT> c(ValueT(center[0] - origin[0]) / ValueT(voxelSize),
-                         ValueT(center[1] - origin[1]) / ValueT(voxelSize),
-                         ValueT(center[2] - origin[2]) / ValueT(voxelSize));
-
-    // Define bounds of the voxel coordinates
-    const int imin = Floor(c[0] - rmax), imax = Ceil(c[0] + rmax);
-    const int jmin = Floor(c[1] - rmax), jmax = Ceil(c[1] + rmax);
-    const int kmin = Floor(c[2] - rmax), kmax = Ceil(c[2] + rmax);
-
-    const Range<1,int> range(imin, imax+1, 32);
-
-    auto kernel = [&](const Range<1,int> &r) {
-        auto acc = grid->getWriteAccessor();
-        Coord ijk;
-        int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1;
-        // Compute signed distances to sphere using leapfrogging in k
-        for (i = r.begin(); i < r.end(); ++i) {
-            const auto x2 = Pow2(ValueT(i) - c[0]);
-            for (j = jmin; j <= jmax; ++j) {
-                const auto x2y2 = Pow2(ValueT(j) - c[1]) + x2;
-                for (k = kmin; k <= kmax; k += m) {
-                    m = 1;
-                    const auto v = Sqrt(x2y2 + Pow2(ValueT(k) - c[2])) - r0; // Distance in voxel units
-                    const auto d = v < 0 ?
-v : v; - if (d < halfWidth) { // inside narrow band - acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units - } else { // outside narrow band - m += Floor(d - halfWidth); // leapfrog - } - } //end leapfrog over k - } //end loop over j - } //end loop over i - };// kernel -#ifdef NANOVDB_PARALLEL_PRIMITIVES - forEach(range, kernel); -#else - kernel(range); -#endif - return grid; -} // initSphere - -template -std::shared_ptr> -initTorus(double radius1, // major radius of torus in world units - double radius2, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin) // origin of grid in world units -{ - using GridT = build::Grid; - using ValueT = typename BuildToValueMap::type; - static_assert(is_floating_point::value, "initTorus: expect floating point"); - if (!(radius2 > 0)) - throw std::runtime_error("Torus: radius2 must be positive!"); - if (!(radius1 > radius2)) - throw std::runtime_error("Torus: radius1 must be larger than radius2!"); - if (!(voxelSize > 0)) - throw std::runtime_error("Torus: voxelSize must be positive!"); - if (!(halfWidth > 0)) - throw std::runtime_error("Torus: halfWidth must be positive!"); - - auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); - grid->setTransform(voxelSize, origin); - - // Define size of torus with narrow-band in voxel units - const ValueT r1 = radius1 / ValueT(voxelSize), r2 = radius2 / ValueT(voxelSize), rmax1 = r1 + r2 + ValueT(halfWidth), rmax2 = r2 + ValueT(halfWidth); - - // Radius below the Nyquist frequency - if (r2 < ValueT(1.5)) return grid; - - // Define center of torus in voxel units - const Vec3 c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), - ValueT(center[1] - origin[1]) / ValueT(voxelSize), - ValueT(center[2] - origin[2]) / ValueT(voxelSize)); - - // Define bounds of the voxel coordinates - const int imin = Floor(c[0] - rmax1), imax = Ceil(c[0] + rmax1); - const int jmin = Floor(c[1] - rmax2), jmax = Ceil(c[1] + rmax2); - const int kmin = Floor(c[2] - rmax1), kmax = Ceil(c[2] + rmax1); - - const Range<1,int> range(imin, imax+1, 32); - auto kernel = [&](const Range<1,int> &r) { - auto acc = grid->getWriteAccessor(); - Coord ijk; - int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1; - // Compute signed distances to torus using leapfrogging in k - for (i = r.begin(); i < r.end(); ++i) { - const auto x2 = Pow2(ValueT(i) - c[0]); - for (k = kmin; k <= kmax; ++k) { - const auto x2z2 = Pow2(Sqrt(Pow2(ValueT(k) - c[2]) + x2) - r1); - for (j = jmin; j <= jmax; j += m) { - m = 1; - const auto v = Sqrt(x2z2 + Pow2(ValueT(j) - c[1])) - r2; // Distance in voxel units - const auto d = v < 0 ? 
-v : v; - if (d < halfWidth) { // inside narrow band - acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units - } else { // outside narrow band - m += Floor(d - halfWidth); // leapfrog - } - } //end leapfrog over k - } //end loop over j - } //end loop over i - }; // kernel - -#ifdef NANOVDB_PARALLEL_PRIMITIVES - forEach(range, kernel); -#else - kernel(range); -#endif - - return grid; -} // initTorus - -template -std::shared_ptr> -initBox(double width, // major radius of torus in world units - double height, // minor radius of torus in world units - double depth, - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin) // origin of grid in world units -{ - using GridT = build::Grid; - using ValueT = typename BuildToValueMap::type; - static_assert(is_floating_point::value, "initBox: expect floating point"); - using Vec3T = Vec3; - if (!(width > 0)) - throw std::runtime_error("Box: width must be positive!"); - if (!(height > 0)) - throw std::runtime_error("Box: height must be positive!"); - if (!(depth > 0)) - throw std::runtime_error("Box: depth must be positive!"); - - if (!(voxelSize > 0)) - throw std::runtime_error("Box: voxelSize must be positive!"); - if (!(halfWidth > 0)) - throw std::runtime_error("Box: halfWidth must be positive!"); - - auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); - grid->setTransform(voxelSize, origin); - - // Define size of box with narrow-band in voxel units - const Vec3T r(width / (2 * ValueT(voxelSize)), - height / (2 * ValueT(voxelSize)), - depth / (2 * ValueT(voxelSize))); - - // Below the Nyquist frequency - if (r.min() < ValueT(1.5)) return grid; - - // Define center of box in voxel units - const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), - ValueT(center[1] - origin[1]) / ValueT(voxelSize), - ValueT(center[2] - origin[2]) / ValueT(voxelSize)); - - // Define utility functions - auto Pos = [](ValueT x) { return x > 0 ? x : 0; }; - auto Neg = [](ValueT x) { return x < 0 ? 
x : 0; }; - - // Define bounds of the voxel coordinates - const BBox b(c - r - Vec3T(ValueT(halfWidth)), c + r + Vec3T(ValueT(halfWidth))); - const CoordBBox bbox(Coord(Floor(b[0][0]), Floor(b[0][1]), Floor(b[0][2])), - Coord( Ceil(b[1][0]), Ceil(b[1][1]), Ceil(b[1][2]))); - const Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); - - // Compute signed distances to box using leapfrogging in k - auto kernel = [&](const Range<1,int> &ra) { - auto acc = grid->getWriteAccessor(); - int m = 1; - for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { - const auto q1 = Abs(ValueT(p[0]) - c[0]) - r[0]; - const auto x2 = Pow2(Pos(q1)); - for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { - const auto q2 = Abs(ValueT(p[1]) - c[1]) - r[1]; - const auto q0 = Max(q1, q2); - const auto x2y2 = x2 + Pow2(Pos(q2)); - for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { - m = 1; - const auto q3 = Abs(ValueT(p[2]) - c[2]) - r[2]; - const auto v = Sqrt(x2y2 + Pow2(Pos(q3))) + Neg(Max(q0, q3)); // Distance in voxel units - const auto d = Abs(v); - if (d < halfWidth) { // inside narrow band - acc.setValue(p, ValueT(voxelSize) * v); // distance in world units - } else { // outside narrow band - m += Floor(d - halfWidth); // leapfrog - } - } //end leapfrog over k - } //end loop over j - } //end loop over i - }; // kernel -#ifdef NANOVDB_PARALLEL_PRIMITIVES - forEach(range, kernel); -#else - kernel(range); -#endif - return grid; -} // initBox - -template -std::shared_ptr> -initBBox(double width, // width of the bbox in world units - double height, // height of the bbox in world units - double depth, // depth of the bbox in world units - double thickness, // thickness of the wire in world units - const Vec3d& center, // center of bbox in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin) // origin of grid in world units -{ - using GridT = build::Grid; - using ValueT = typename BuildToValueMap::type; - static_assert(is_floating_point::value, "initBBox: expect floating point"); - using Vec3T = Vec3; - if (!(width > 0)) - throw std::runtime_error("BBox: width must be positive!"); - if (!(height > 0)) - throw std::runtime_error("BBox: height must be positive!"); - if (!(depth > 0)) - throw std::runtime_error("BBox: depth must be positive!"); - if (!(thickness > 0)) - throw std::runtime_error("BBox: thickness must be positive!"); - if (!(voxelSize > 0.0)) - throw std::runtime_error("BBox: voxelSize must be positive!"); - - - auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); - grid->setTransform(voxelSize, origin); - - // Define size of bbox with narrow-band in voxel units - const Vec3T r(width / (2 * ValueT(voxelSize)), - height / (2 * ValueT(voxelSize)), - depth / (2 * ValueT(voxelSize))); - const ValueT e = thickness / ValueT(voxelSize); - - // Below the Nyquist frequency - if (r.min() < ValueT(1.5) || e < ValueT(1.5)) return grid; - - // Define center of bbox in voxel units - const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), - ValueT(center[1] - origin[1]) / ValueT(voxelSize), - ValueT(center[2] - origin[2]) / ValueT(voxelSize)); - - // Define utility functions - auto Pos = [](ValueT x) { return x > 0 ? x : 0; }; - auto Neg = [](ValueT x) { return x < 0 ? 
x : 0; }; - - // Define bounds of the voxel coordinates - const BBox b(c - r - Vec3T(e + ValueT(halfWidth)), c + r + Vec3T(e + ValueT(halfWidth))); - const CoordBBox bbox(Coord(Floor(b[0][0]), Floor(b[0][1]), Floor(b[0][2])), - Coord( Ceil(b[1][0]), Ceil(b[1][1]), Ceil(b[1][2]))); - const Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); - - // Compute signed distances to bbox using leapfrogging in k - auto kernel = [&](const Range<1,int> &ra) { - auto acc = grid->getWriteAccessor(); - int m = 1; - for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { - const ValueT px = Abs(ValueT(p[0]) - c[0]) - r[0]; - const ValueT qx = Abs(ValueT(px) + e) - e; - const ValueT px2 = Pow2(Pos(px)); - const ValueT qx2 = Pow2(Pos(qx)); - for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { - const ValueT py = Abs(ValueT(p[1]) - c[1]) - r[1]; - const ValueT qy = Abs(ValueT(py) + e) - e; - const ValueT qy2 = Pow2(Pos(qy)); - const ValueT px2qy2 = px2 + qy2; - const ValueT qx2py2 = qx2 + Pow2(Pos(py)); - const ValueT qx2qy2 = qx2 + qy2; - const ValueT a[3] = {Max(px, qy), Max(qx, py), Max(qx, qy)}; - for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { - m = 1; - const ValueT pz = Abs(ValueT(p[2]) - c[2]) - r[2]; - const ValueT qz = Abs(ValueT(pz) + e) - e; - const ValueT qz2 = Pow2(Pos(qz)); - const ValueT s1 = Sqrt(px2qy2 + qz2) + Neg(Max(a[0], qz)); - const ValueT s2 = Sqrt(qx2py2 + qz2) + Neg(Max(a[1], qz)); - const ValueT s3 = Sqrt(qx2qy2 + Pow2(Pos(pz))) + Neg(Max(a[2], pz)); - const ValueT v = Min(s1, Min(s2, s3)); // Distance in voxel units - const ValueT d = Abs(v); - if (d < halfWidth) { // inside narrow band - acc.setValue(p, ValueT(voxelSize) * v); // distance in world units - } else { // outside narrow band - m += Floor(d - halfWidth); // leapfrog - } - } //end leapfrog over k - } //end loop over j - } //end loop over i - }; //kernel -#ifdef NANOVDB_PARALLEL_PRIMITIVES - forEach(range, kernel); -#else - kernel(range); -#endif - - return grid; -} // initBBox - -template -std::shared_ptr> -initOctahedron(double scale, // scale of the octahedron in world units - const Vec3d& center, // center of octahedron in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin) // origin of grid in world units -{ - using GridT = build::Grid; - using ValueT = typename BuildToValueMap::type; - using Vec3T = Vec3; - static_assert(is_floating_point::value, "initOctahedron: expect floating point"); - - if (!(scale > 0)) throw std::runtime_error("Octahedron: width must be positive!"); - if (!(voxelSize > 0)) throw std::runtime_error("Octahedron: voxelSize must be positive!"); - - auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); - grid->setTransform(voxelSize, origin); - - // Define size of octahedron with narrow-band in voxel units - const ValueT s = scale / (2 * ValueT(voxelSize)); - - // Below the Nyquist frequency - if ( s < ValueT(1.5) ) return grid; - - // Define center of octahedron in voxel units - const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), - ValueT(center[1] - origin[1]) / ValueT(voxelSize), - ValueT(center[2] - origin[2]) / ValueT(voxelSize)); - - // Define utility functions - auto sdf = [&s](ValueT x, ValueT y, ValueT z) { - const ValueT d = ValueT(0.5)*(z - y + s); - if (d < ValueT(0)) { - return Vec3T(x, y - s, z).length(); - } else if (d > s) { - return Vec3T(x, y, z - s).length(); - } - return Vec3T(x, y - s + d, z - d).length(); - }; - - // Define 
bounds of the voxel coordinates - const BBox b(c - Vec3T(s + ValueT(halfWidth)), c + Vec3T(s + ValueT(halfWidth))); - const CoordBBox bbox(Coord(Floor(b[0][0]), Floor(b[0][1]), Floor(b[0][2])), - Coord( Ceil(b[1][0]), Ceil(b[1][1]), Ceil(b[1][2]))); - const Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); - - // Compute signed distances to octahedron using leapfrogging in k - auto kernel = [&](const Range<1,int> &ra) { - auto acc = grid->getWriteAccessor(); - int m = 1; - static const ValueT a = Sqrt(ValueT(1)/ValueT(3)); - for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { - const ValueT px = Abs(ValueT(p[0]) - c[0]); - for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { - const ValueT py = Abs(ValueT(p[1]) - c[1]); - for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { - m = 1; - const ValueT pz = Abs(ValueT(p[2]) - c[2]); - ValueT d = px + py + pz - s; - ValueT v; - if (ValueT(3)*px < d) { - v = sdf(px, py, pz); - } else if (ValueT(3)*py < d) { - v = sdf(py, pz, px); - } else if (ValueT(3)*pz < d) { - v = sdf(pz, px, py); - } else { - v = a * d; - } - d = Abs(v); - if (d < halfWidth) { // inside narrow band - acc.setValue(p, ValueT(voxelSize) * v); // distance in world units - } else { // outside narrow band - m += Floor(d - halfWidth); // leapfrog - } - } //end leapfrog over k - } //end loop over j - } //end loop over i - };// kernel -#ifdef NANOVDB_PARALLEL_PRIMITIVES - forEach(range, kernel); -#else - kernel(range); -#endif - return grid; -} // initOctahedron - -} // unnamed namespace - -//================================================================================================ - -template -typename enable_if::value || - is_same::value, GridHandle>::type -createLevelSetSphere(double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createLevelSetSphere - -//================================================================================================ - -template -typename enable_if::value || - is_same::value || - is_same::value, GridHandle>::type -createLevelSetSphere(double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid 
converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createLevelSetSphere - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createLevelSetSphere(double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance,// only used if VoxelT = FpN - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createLevelSetSphere - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createFogVolumeSphere(double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createFogVolumeSphere - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createFogVolumeSphere(double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance,// only used if VoxelT = FpN - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - 
converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createFogVolumeSphere - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createPointSphere(int pointsPerVoxel, // number of points to be scattered in each active voxel - double radius, // radius of sphere in world units - const Vec3d& center, // center of sphere in world units - double voxelSize, // size of a voxel in world units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - auto sphereHandle = createLevelSetSphere(radius, center, voxelSize, 0.5, origin, "dummy", - StatsMode::BBox, ChecksumMode::Disable, buffer); - assert(sphereHandle); - auto* sphereGrid = sphereHandle.template grid(); - assert(sphereGrid); - auto pointHandle = createPointScatter(*sphereGrid, pointsPerVoxel, name, cMode, buffer); - assert(pointHandle); - return pointHandle; -} // createPointSphere - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createLevelSetTorus(double majorRadius, // major radius of torus in world units - double minorRadius, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createLevelSetTorus - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createLevelSetTorus(double majorRadius, // major radius of torus in world units - double minorRadius, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template 
getHandle(oracle, buffer); - assert(handle); - return handle; -} // createLevelSetTorus - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createFogVolumeTorus(double majorRadius, // major radius of torus in world units - double minorRadius, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createFogVolumeTorus - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createFogVolumeTorus(double majorRadius, // major radius of torus in world units - double minorRadius, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createFogVolumeTorus - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createPointTorus(int pointsPerVoxel, // number of points to be scattered in each active voxel - double majorRadius, // major radius of torus in world units - double minorRadius, // minor radius of torus in world units - const Vec3d& center, // center of torus in world units - double voxelSize, // size of a voxel in world units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - auto torusHandle = createLevelSetTorus(majorRadius, minorRadius, center, voxelSize, 0.5f, origin, - "dummy", StatsMode::BBox, ChecksumMode::Disable, buffer); - assert(torusHandle); - auto* torusGrid = torusHandle.template grid(); - assert(torusGrid); - auto pointHandle = createPointScatter(*torusGrid, 
pointsPerVoxel, name, cMode, buffer); - assert(pointHandle); - return pointHandle; -} // createPointTorus - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createLevelSetBox(double width, // width of box in world units - double height, // height of box in world units - double depth, // depth of box in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createLevelSetBox - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createLevelSetBox(double width, // width of box in world units - double height, // height of box in world units - double depth, // depth of box in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createLevelSetBox - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createLevelSetOctahedron(double scale, // scale of the octahedron in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; 
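[Editor's note] The createLevelSet*/createFogVolume* factories above all follow one pattern: build a build::Grid, run build::sdfToLevelSet (optionally followed by levelSetToFog), then convert with CreateNanoGrid. A minimal usage sketch, assuming the deprecated-but-forwarding header path nanovdb/util/CreatePrimitives.h, the default HostBuffer, and the defaulted trailing arguments (name, stats mode, checksum mode):

#include <nanovdb/util/CreatePrimitives.h>

int main()
{
    // 100-unit-radius sphere at the origin, 0.5 units/voxel, 3-voxel half-width band
    auto handle = nanovdb::createLevelSetSphere<float>(100.0, nanovdb::Vec3d(0.0), 0.5, 3.0);
    const auto* grid = handle.grid<float>();
    return (grid && grid->isLevelSet()) ? 0 : 1; // sanity check the result
}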
-} // createLevelSetOctahedron - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createLevelSetOctahedron(double scale, // scale of the octahedron in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createLevelSetOctahedron - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createLevelSetBBox(double width, // width of bbox in world units - double height, // height of bbox in world units - double depth, // depth of bbox in world units - double thickness, // thickness of the wire in world units - const Vec3d& center, // center of bbox in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createLevelSetBBox - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createLevelSetBBox(double width, // width of bbox in world units - double height, // height of bbox in world units - double depth, // depth of bbox in world units - double thickness, // thickness of the wire in world units - const Vec3d& center, // center of bbox in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); 
- converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createLevelSetBBox - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createFogVolumeBox(double width, // width of box in world units - double height, // height of box in world units - double depth, // depth of box in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createFogVolumeBox - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createFogVolumeBox(double width, // width of box in world units - double height, // height of box in world units - double depth, // depth of box in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createFogVolumeBox - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createFogVolumeOctahedron(double scale, // scale of octahedron in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - 
build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - auto handle = converter.template getHandle(buffer); - assert(handle); - return handle; -} // createFogVolumeOctahedron - -//================================================================================================ - -template -typename enable_if::value, GridHandle>::type -createFogVolumeOctahedron(double scale, // scale of octahedron in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - double halfWidth, // half-width of narrow band in voxel units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - StatsMode sMode, // mode of computation for the statistics - ChecksumMode cMode, // mode of computation for the checksum - float tolerance, - bool ditherOn, - const BufferT& buffer) -{ - using GridT = build::Grid; - auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); - grid->mName = name; - build::NodeManager mgr(*grid); - build::sdfToLevelSet(mgr); - build::levelSetToFog(mgr, false); - CreateNanoGrid converter(*grid); - converter.setStats(sMode); - converter.setChecksum(cMode); - converter.enableDithering(ditherOn); - AbsDiff oracle(tolerance); - auto handle = converter.template getHandle(oracle, buffer); - assert(handle); - return handle; -} // createFogVolumeOctahedron - -//================================================================================================ - -template -typename disable_if::value, GridHandle>::type -createPointBox(int pointsPerVoxel, // number of points to be scattered in each active voxel - double width, // width of box in world units - double height, // height of box in world units - double depth, // depth of box in world units - const Vec3d& center, // center of box in world units - double voxelSize, // size of a voxel in world units - const Vec3d& origin, // origin of grid in world units - const std::string& name, // name of grid - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - auto boxHandle = createLevelSetBox(width, height, depth, center, voxelSize, 0.5, origin, "dummy", - StatsMode::BBox, ChecksumMode::Disable, buffer); - assert(boxHandle); - auto* boxGrid = boxHandle.template grid(); - assert(boxGrid); - auto pointHandle = createPointScatter(*boxGrid, pointsPerVoxel, name, cMode, buffer); - assert(pointHandle); - return pointHandle; -} // createPointBox - -//================================================================================================ - -template -inline GridHandle -createPointScatter(const NanoGrid& srcGrid, // origin of grid in world units - int pointsPerVoxel, // number of points to be scattered in each active voxel - const std::string& name, // name of grid - ChecksumMode cMode, // mode of computation for the checksum - const BufferT& buffer) -{ - using ValueT = typename BuildToValueMap::type; - static_assert(is_floating_point::value, "createPointScatter: expect floating point"); - using Vec3T = Vec3; - if (pointsPerVoxel < 1) { - throw std::runtime_error("createPointScatter: Expected at least one point per voxel"); - } - if (!srcGrid.isLevelSet()) { - throw std::runtime_error("createPointScatter: Expected a level set grid"); - } - if (!srcGrid.hasBBox()) { - throw std::runtime_error("createPointScatter: ActiveVoxelCount is required"); - } - const uint64_t pointCount = pointsPerVoxel * 
srcGrid.activeVoxelCount(); - if (pointCount == 0) { - throw std::runtime_error("createPointScatter: No particles to scatter"); - } - std::vector xyz; - xyz.reserve(pointCount); - using DstGridT = build::Grid; - DstGridT dstGrid(std::numeric_limits::max(), name, GridClass::PointData); - dstGrid.mMap = srcGrid.map(); - auto dstAcc = dstGrid.getAccessor(); - std::srand(1234); - const ValueT s = 1 / (1 + ValueT(RAND_MAX)); // scale so s*rand() is in ] 0, 1 [ - // return a point with random local voxel coordinates (-0.5 to +0.5) - auto randomPoint = [&s](){return s * Vec3T(rand(), rand(), rand()) - Vec3T(0.5);}; - const auto& srcTree = srcGrid.tree(); - auto srcMgrHandle = createNodeManager(srcGrid); - auto *srcMgr = srcMgrHandle.template mgr(); - assert(srcMgr); - for (uint32_t i = 0, end = srcTree.nodeCount(0); i < end; ++i) { - auto& srcLeaf = srcMgr->leaf(i); - auto* dstLeaf = dstAcc.setValue(srcLeaf.origin(), pointsPerVoxel); // allocates leaf node - dstLeaf->mValueMask = srcLeaf.valueMask(); - for (uint32_t j = 0, m = 0; j < 512; ++j) { - if (dstLeaf->mValueMask.isOn(j)) { - const Vec3f ijk = dstLeaf->offsetToGlobalCoord(j).asVec3s();// floating-point representation of index coordinates - for (int n = 0; n < pointsPerVoxel; ++n) xyz.push_back(srcGrid.indexToWorld(randomPoint() + ijk)); - m += pointsPerVoxel; - }// active voxels - dstLeaf->mValues[j] = m; - }// loop over all voxels - }// loop over leaf nodes - assert(pointCount == xyz.size()); - CreateNanoGrid converter(dstGrid); - converter.setStats(StatsMode::MinMax); - converter.setChecksum(ChecksumMode::Disable); - - converter.addBlindData(name, - GridBlindDataSemantic::WorldCoords, - GridBlindDataClass::AttributeArray, - mapToGridType(), - pointCount, - sizeof(Vec3T)); - auto handle = converter.template getHandle(buffer); - assert(handle); - - auto* grid = handle.template grid(); - assert(grid && grid->template isSequential<0>()); - auto &tree = grid->tree(); - if (tree.nodeCount(0) == 0) throw std::runtime_error("Expect leaf nodes!"); - auto *leafData = tree.getFirstLeaf()->data(); - leafData[0].mMinimum = 0; // start of prefix sum - for (uint32_t i = 1, n = tree.nodeCount(0); i < n; ++i) { - leafData[i].mMinimum = leafData[i - 1].mMinimum + leafData[i - 1].mMaximum; - } - if (Vec3T *blindData = grid->template getBlindData(0)) { - memcpy(blindData, xyz.data(), xyz.size() * sizeof(Vec3T)); - } else { - throw std::runtime_error("Blind data pointer was NULL"); - } - updateChecksum(*grid, cMode); - return handle; -} // createPointScatter - -} // namespace nanovdb - -#endif // NANOVDB_PRIMITIVES_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/CreatePrimitives.h instead.") diff --git a/nanovdb/nanovdb/util/Range.h b/nanovdb/nanovdb/util/Range.h index 7b21b7ce94..c12873513e 100644 --- a/nanovdb/nanovdb/util/Range.h +++ b/nanovdb/nanovdb/util/Range.h @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 /*!
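[Editor's note] In createPointScatter above, the constant s = 1/(1+RAND_MAX) is what keeps the per-voxel jitter strictly below the upper voxel face. A minimal, self-contained check of that local-coordinate trick (plain C++ with hand-picked bounds, not NanoVDB API):

#include <cstdlib>
#include <cassert>

int main()
{
    std::srand(1234);                    // fixed seed, as in the code above
    const double s = 1.0 / (1.0 + double(RAND_MAX));
    for (int i = 0; i < 1000; ++i) {
        // s*rand() lies in [0, 1), so the local voxel offset lies in [-0.5, 0.5)
        const double u = s * std::rand() - 0.5;
        assert(u >= -0.5 && u < 0.5);
    }
    return 0;
}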
- \file Range.h + \file nanovdb/util/Range.h \author Ken Museth @@ -11,10 +11,11 @@ \brief Custom Range class that is compatible with the tbb::blocked_range classes */ -#ifndef NANOVDB_RANGE_H_HAS_BEEN_INCLUDED -#define NANOVDB_RANGE_H_HAS_BEEN_INCLUDED +#ifndef NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED #include +#include // for size_t #ifdef NANOVDB_USE_TBB #include // for tbb::split @@ -22,6 +23,8 @@ namespace nanovdb { +namespace util { + class Split {};// Dummy class used by split constructors template @@ -144,6 +147,12 @@ class Range<3, T> const Range<1, T>& operator[](int i) const { assert(i==0 || i==1 || i==2); return mRange[i]; } };// Range<3, T> +}// namespace util + +using Range1D [[deprecated("Use nanovdb::util::Range1D instead")]] = util::Range<1, size_t>; +using Range2D [[deprecated("Use nanovdb::util::Range2D instead")]] = util::Range<2, size_t>; +using Range3D [[deprecated("Use nanovdb::util::Range3D instead")]] = util::Range<3, size_t>; + }// namespace nanovdb -#endif // NANOVDB_RANGE_H_HAS_BEEN_INCLUDED +#endif // NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/util/Ray.h b/nanovdb/nanovdb/util/Ray.h index 62d6ff51a0..1fed33bf7c 100644 --- a/nanovdb/nanovdb/util/Ray.h +++ b/nanovdb/nanovdb/util/Ray.h @@ -1,551 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/// @file Ray.h -/// -/// @author Ken Museth -/// -/// @brief A Ray class. - -#ifndef NANOVDB_RAY_H_HAS_BEEN_INCLUDED -#define NANOVDB_RAY_H_HAS_BEEN_INCLUDED - -#include // for Vec3 - -namespace nanovdb { - -template -class Ray -{ -public: - using RealType = RealT; - using Vec3Type = Vec3; - using Vec3T = Vec3Type; - - struct TimeSpan - { - RealT t0, t1; - /// @brief Default constructor - __hostdev__ TimeSpan() {} - /// @brief Constructor - __hostdev__ TimeSpan(RealT _t0, RealT _t1) - : t0(_t0) - , t1(_t1) - { - } - /// @brief Set both times - __hostdev__ void set(RealT _t0, RealT _t1) - { - t0 = _t0; - t1 = _t1; - } - /// @brief Get both times - __hostdev__ void get(RealT& _t0, RealT& _t1) const - { - _t0 = t0; - _t1 = t1; - } - /// @brief Return @c true if t1 is larger than t0 by at least eps. - __hostdev__ bool valid(RealT eps = Delta::value()) const { return (t1 - t0) > eps; } - /// @brief Return the midpoint of the ray. 
- __hostdev__ RealT mid() const { return 0.5 * (t0 + t1); } - /// @brief Multiplies both times - __hostdev__ void scale(RealT s) - { - assert(s > 0); - t0 *= s; - t1 *= s; - } - /// @brief Return @c true if time is inclusive - __hostdev__ bool test(RealT t) const { return (t >= t0 && t <= t1); } - }; - - __hostdev__ Ray(const Vec3Type& eye = Vec3Type(0, 0, 0), - const Vec3Type& direction = Vec3Type(1, 0, 0), - RealT t0 = Delta::value(), - RealT t1 = Maximum::value()) - : mEye(eye) - , mDir(direction) - , mInvDir(1 / mDir[0], 1 / mDir[1], 1 / mDir[2]) - , mTimeSpan(t0, t1) - , mSign{mInvDir[0] < 0, mInvDir[1] < 0, mInvDir[2] < 0} - { - } - - __hostdev__ Ray& offsetEye(RealT offset) - { - mEye[0] += offset; - mEye[1] += offset; - mEye[2] += offset; - return *this; - } - - __hostdev__ Ray& setEye(const Vec3Type& eye) - { - mEye = eye; - return *this; - } - - __hostdev__ Ray& setDir(const Vec3Type& dir) - { - mDir = dir; - mInvDir[0] = 1.0 / mDir[0]; - mInvDir[1] = 1.0 / mDir[1]; - mInvDir[2] = 1.0 / mDir[2]; - mSign[0] = mInvDir[0] < 0; - mSign[1] = mInvDir[1] < 0; - mSign[2] = mInvDir[2] < 0; - return *this; - } - - __hostdev__ Ray& setMinTime(RealT t0) - { - mTimeSpan.t0 = t0; - return *this; - } - - __hostdev__ Ray& setMaxTime(RealT t1) - { - mTimeSpan.t1 = t1; - return *this; - } - - __hostdev__ Ray& setTimes( - RealT t0 = Delta::value(), - RealT t1 = Maximum::value()) - { - assert(t0 > 0 && t1 > 0); - mTimeSpan.set(t0, t1); - return *this; - } - - __hostdev__ Ray& scaleTimes(RealT scale) - { - mTimeSpan.scale(scale); - return *this; - } - - __hostdev__ Ray& reset( - const Vec3Type& eye, - const Vec3Type& direction, - RealT t0 = Delta::value(), - RealT t1 = Maximum::value()) - { - this->setEye(eye); - this->setDir(direction); - this->setTimes(t0, t1); - return *this; - } - - __hostdev__ const Vec3T& eye() const { return mEye; } - - __hostdev__ const Vec3T& dir() const { return mDir; } - - __hostdev__ const Vec3T& invDir() const { return mInvDir; } - - __hostdev__ RealT t0() const { return mTimeSpan.t0; } - - __hostdev__ RealT t1() const { return mTimeSpan.t1; } - - __hostdev__ int sign(int i) const { return mSign[i]; } - - /// @brief Return the position along the ray at the specified time. - __hostdev__ Vec3T operator()(RealT time) const - { -#if 1 - return Vec3T(fmaf(time, mDir[0], mEye[0]), - fmaf(time, mDir[1], mEye[1]), - fmaf(time, mDir[2], mEye[2])); -#else - return mEye + mDir * time; -#endif - } - - /// @brief Return the starting point of the ray. - __hostdev__ Vec3T start() const { return (*this)(mTimeSpan.t0); } - - /// @brief Return the endpoint of the ray. - __hostdev__ Vec3T end() const { return (*this)(mTimeSpan.t1); } - - /// @brief Return the midpoint of the ray. - __hostdev__ Vec3T mid() const { return (*this)(mTimeSpan.mid()); } - - /// @brief Return @c true if t1 is larger than t0 by at least eps. - __hostdev__ bool valid(RealT eps = Delta::value()) const { return mTimeSpan.valid(eps); } - - /// @brief Return @c true if @a time is within t0 and t1, both inclusive. - __hostdev__ bool test(RealT time) const { return mTimeSpan.test(time); } - - /// @brief Return a new Ray that is transformed with the specified map. - /// - /// @param map the map from which to construct the new Ray. - /// - /// @warning Assumes a linear map and a normalized direction. - /// - /// @details The requirement that the direction is normalized - /// follows from the transformation of t0 and t1 - and the fact that - /// we want applyMap and applyInverseMap to be inverse operations. 
- template - __hostdev__ Ray applyMap(const MapType& map) const - { - const Vec3T eye = map.applyMap(mEye); - const Vec3T dir = map.applyJacobian(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - RealT t1 = mTimeSpan.t1; - if (mTimeSpan.t1 < Maximum::value()) { - t1 *= length; - } - return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); - } - template - __hostdev__ Ray applyMapF(const MapType& map) const - { - const Vec3T eye = map.applyMapF(mEye); - const Vec3T dir = map.applyJacobianF(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - RealT t1 = mTimeSpan.t1; - if (mTimeSpan.t1 < Maximum::value()) { - t1 *= length; - } - return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); - } - - /// @brief Return a new Ray that is transformed with the inverse of the specified map. - /// - /// @param map the map from which to construct the new Ray by inverse mapping. - /// - /// @warning Assumes a linear map and a normalized direction. - /// - /// @details The requirement that the direction is normalized - /// follows from the transformation of t0 and t1 - and the fact that - /// we want applyMap and applyInverseMap to be inverse operations. - template - __hostdev__ Ray applyInverseMap(const MapType& map) const - { - const Vec3T eye = map.applyInverseMap(mEye); - const Vec3T dir = map.applyInverseJacobian(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); - } - template - __hostdev__ Ray applyInverseMapF(const MapType& map) const - { - const Vec3T eye = map.applyInverseMapF(mEye); - const Vec3T dir = map.applyInverseJacobianF(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); - } - - /// @brief Return a new ray in world space, assuming the existing - /// ray is represented in the index space of the specified grid. - template - __hostdev__ Ray indexToWorldF(const GridType& grid) const - { - const Vec3T eye = grid.indexToWorldF(mEye); - const Vec3T dir = grid.indexToWorldDirF(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - RealT t1 = mTimeSpan.t1; - if (mTimeSpan.t1 < Maximum::value()) { - t1 *= length; - } - return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); - } - - /// @brief Return a new ray in index space, assuming the existing - /// ray is represented in the world space of the specified grid. - template - __hostdev__ Ray worldToIndexF(const GridType& grid) const - { - const Vec3T eye = grid.worldToIndexF(mEye); - const Vec3T dir = grid.worldToIndexDirF(mDir); - const RealT length = dir.length(), invLength = RealT(1) / length; - RealT t1 = mTimeSpan.t1; - if (mTimeSpan.t1 < Maximum::value()) { - t1 *= length; - } - return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); - } - - /// @brief Return true if this ray intersects the specified sphere. - /// - /// @param center The center of the sphere in the same space as this ray. - /// @param radius The radius of the sphere in the same units as this ray. - /// @param t0 The first intersection point if an intersection exists. - /// @param t1 The second intersection point if an intersection exists. - /// - /// @note If the return value is true, i.e. a hit, and t0 == - /// this->t0() or t1 == this->t1() only one true intersection exists. 
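[Editor's note] The sphere test below uses the standard numerically robust quadratic solve: compute Q from the sign-matched root to avoid catastrophic cancellation, then take t0 = Q/A and t1 = C/Q. A self-contained numeric sketch (plain C++, hand-picked values, not NanoVDB API):

#include <cmath>
#include <cstdio>

int main()
{
    // unit ray from the origin along +x against a sphere at (5,0,0) with radius 1:
    // A = |d|^2, B = 2 d.(e - c), C = |e - c|^2 - r^2
    const double A = 1.0, B = -10.0, C = 24.0;
    const double D = B * B - 4.0 * A * C; // discriminant
    if (D >= 0) {
        const double Q = -0.5 * (B < 0 ? B + std::sqrt(D) : B - std::sqrt(D));
        double t0 = Q / A, t1 = C / Q; // the two roots
        if (t0 > t1) { const double tmp = t0; t0 = t1; t1 = tmp; }
        std::printf("hits at t=%g and t=%g\n", t0, t1); // expect 4 and 6
    }
    return 0;
}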
- __hostdev__ bool intersects(const Vec3T& center, RealT radius, RealT& t0, RealT& t1) const - { - const Vec3T origin = mEye - center; - const RealT A = mDir.lengthSqr(); - const RealT B = 2 * mDir.dot(origin); - const RealT C = origin.lengthSqr() - radius * radius; - const RealT D = B * B - 4 * A * C; - - if (D < 0) { - return false; - } - const RealT Q = RealT(-0.5) * (B < 0 ? (B + Sqrt(D)) : (B - Sqrt(D))); - - t0 = Q / A; - t1 = C / Q; - - if (t0 > t1) { - RealT tmp = t0; - t0 = t1; - t1 = tmp; - } - if (t0 < mTimeSpan.t0) { - t0 = mTimeSpan.t0; - } - if (t1 > mTimeSpan.t1) { - t1 = mTimeSpan.t1; - } - return t0 <= t1; - } - - /// @brief Return true if this ray intersects the specified sphere. - /// - /// @param center The center of the sphere in the same space as this ray. - /// @param radius The radius of the sphere in the same units as this ray. - __hostdev__ bool intersects(const Vec3T& center, RealT radius) const - { - RealT t0, t1; - return this->intersects(center, radius, t0, t1) > 0; - } - - /// @brief Return true if this ray intersects the specified sphere. - /// - /// @note For intersection this ray is clipped to the two intersection points. - /// - /// @param center The center of the sphere in the same space as this ray. - /// @param radius The radius of the sphere in the same units as this ray. - __hostdev__ bool clip(const Vec3T& center, RealT radius) - { - RealT t0, t1; - const bool hit = this->intersects(center, radius, t0, t1); - if (hit) { - mTimeSpan.set(t0, t1); - } - return hit; - } -#if 0 - /// @brief Return true if the Ray intersects the specified - /// axis-aligned bounding box. - /// - /// @param bbox Axis-aligned bounding box in the same space as the Ray. - /// @param t0 If an intersection is detected this is assigned - /// the time for the first intersection point. - /// @param t1 If an intersection is detected this is assigned - /// the time for the second intersection point. - template - __hostdev__ bool intersects(const BBoxT& bbox, RealT& t0, RealT& t1) const - { - t0 = (bbox[ mSign[0]][0] - mEye[0]) * mInvDir[0]; - RealT t2 = (bbox[1-mSign[1]][1] - mEye[1]) * mInvDir[1]; - if (t0 > t2) return false; - t1 = (bbox[1-mSign[0]][0] - mEye[0]) * mInvDir[0]; - RealT t3 = (bbox[ mSign[1]][1] - mEye[1]) * mInvDir[1]; - if (t3 > t1) return false; - if (t3 > t0) t0 = t3; - if (t2 < t1) t1 = t2; - t3 = (bbox[ mSign[2]][2] - mEye[2]) * mInvDir[2]; - if (t3 > t1) return false; - t2 = (bbox[1-mSign[2]][2] - mEye[2]) * mInvDir[2]; - if (t0 > t2) return false; - if (t3 > t0) t0 = t3; - if (mTimeSpan.t1 < t0) return false; - if (t2 < t1) t1 = t2; - if (mTimeSpan.t0 > t1) return false; - if (mTimeSpan.t0 > t0) t0 = mTimeSpan.t0; - if (mTimeSpan.t1 < t1) t1 = mTimeSpan.t1; - return true; - /* - mTimeSpan.get(_t0, _t1); - double t0 = _t0, t1 = _t1; - for (int i = 0; i < 3; ++i) { - //if (abs(mDir[i])<1e-3) continue; - double a = (double(bbox.min()[i]) - mEye[i]) * mInvDir[i]; - double b = (double(bbox.max()[i]) - mEye[i]) * mInvDir[i]; - if (a > b) { - double tmp = a; - a = b; - b = tmp; - } - if (a > t0) t0 = a; - if (b < t1) t1 = b; - if (t0 > t1) { - //if (gVerbose) printf("Missed BBOX: (%i,%i,%i) -> (%i,%i,%i) t0=%f t1=%f\n", - // bbox.min()[0], bbox.min()[1], bbox.min()[2], - // bbox.max()[0], bbox.max()[1], bbox.max()[2], t0, t1); - return false; - } - } - _t0 = t0; _t1 = t1; - return true; - */ - } -#else - /// @brief Returns true if this ray intersects an index bounding box. 
- /// If the return value is true t0 and t1 are set to the intersection - /// times along the ray. - /// - /// @warning Intersection with a CoordBBox internally converts to a floating-point bbox - which implies that the max is padded with one voxel, i.e. bbox.max += 1! This - avoids gaps between neighboring CoordBBox'es, say from neighboring tree nodes. - __hostdev__ bool intersects(const CoordBBox& bbox, RealT& t0, RealT& t1) const - { - mTimeSpan.get(t0, t1); - for (int i = 0; i < 3; ++i) { - RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i] + 1); - if (a >= b) { // empty bounding box - return false; - } - a = (a - mEye[i]) * mInvDir[i]; - b = (b - mEye[i]) * mInvDir[i]; - if (a > b) { - RealT tmp = a; - a = b; - b = tmp; - } - if (a > t0) { - t0 = a; - } - if (b < t1) { - t1 = b; - } - if (t0 > t1) { - return false; - } - } - return true; - } - /// @brief Returns true if this ray intersects a floating-point bounding box. - /// If the return value is true t0 and t1 are set to the intersection - /// times along the ray. - template - __hostdev__ bool intersects(const BBox& bbox, RealT& t0, RealT& t1) const - { - static_assert(is_floating_point::value, "Ray::intersects: Expected a floating point coordinate"); - mTimeSpan.get(t0, t1); - for (int i = 0; i < 3; ++i) { - RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i]); - if (a >= b) { // empty bounding box - return false; - } - a = (a - mEye[i]) * mInvDir[i]; - b = (b - mEye[i]) * mInvDir[i]; - if (a > b) { - RealT tmp = a; - a = b; - b = tmp; - } - if (a > t0) { - t0 = a; - } - if (b < t1) { - t1 = b; - } - if (t0 > t1) { - return false; - } - } - return true; - } -#endif - - /// @brief Return true if this ray intersects the specified bounding box. - /// - /// @param bbox Axis-aligned bounding box in the same space as this ray. - /// - /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point - /// bounding box, which implies that the max is padded with one voxel, i.e. - /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say - /// from neighboring tree nodes. - template - __hostdev__ bool intersects(const BBoxT& bbox) const - { -#if 1 - RealT t0, t1; - return this->intersects(bbox, t0, t1); -#else - //BBox bbox(Vec3T(_bbox[0][0]-1e-4,_bbox[0][1]-1e-4,_bbox[0][2]-1e-4), - // Vec3T(_bbox[1][0]+1e-4,_bbox[1][1]+1e-4,_bbox[1][2]+1e-4)); - RealT t0 = (bbox[mSign[0]][0] - mEye[0]) * mInvDir[0]; - RealT t2 = (bbox[1 - mSign[1]][1] - mEye[1]) * mInvDir[1]; - if (t0 > t2) return false; - RealT t1 = (bbox[1 - mSign[0]][0] - mEye[0]) * mInvDir[0]; - RealT t3 = (bbox[mSign[1]][1] - mEye[1]) * mInvDir[1]; - if (t3 > t1) return false; - if (t3 > t0) t0 = t3; - if (t2 < t1) t1 = t2; - t3 = (bbox[mSign[2]][2] - mEye[2]) * mInvDir[2]; - if (t3 > t1) return false; - t2 = (bbox[1 - mSign[2]][2] - mEye[2]) * mInvDir[2]; - if (t0 > t2) return false; - //if (t3 > t0) t0 = t3; - //if (mTimeSpan.t1 < t0) return false; - //if (t2 < t1) t1 = t2; - //return mTimeSpan.t0 < t1; - return true; -#endif - } - - /// @brief Return true if this ray intersects the specified bounding box. - /// - /// @param bbox Axis-aligned bounding box in the same space as this ray. - /// - /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point - /// bounding box, which implies that the max is padded with one voxel, i.e. - /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say - /// from neighboring tree nodes. 
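[Editor's note] Both intersects() overloads above are per-axis slab tests; the CoordBBox overload widens max by one voxel so adjacent node boxes tile without gaps. A stand-alone sketch with a hand-picked ray and an 8^3 leaf box (hypothetical values, plain C++, not NanoVDB API):

#include <algorithm>
#include <cstdio>

int main()
{
    const double eye[3] = {-1.0, 0.5, 0.5}, invDir[3] = {1.0, 1e30, 1e30}; // ray along +x
    const int bmin[3] = {0, 0, 0}, bmax[3] = {7, 7, 7}; // a leaf node's CoordBBox
    double t0 = 0.0, t1 = 1e30;
    bool hit = true;
    for (int i = 0; i < 3 && hit; ++i) {
        double a = (double(bmin[i]) - eye[i]) * invDir[i];
        double b = (double(bmax[i] + 1) - eye[i]) * invDir[i]; // note the +1 voxel padding
        if (a > b) std::swap(a, b);
        if (a > t0) t0 = a;
        if (b < t1) t1 = b;
        hit = t0 <= t1;
    }
    if (hit) std::printf("hit: t0=%g t1=%g\n", t0, t1); // expect t0=1, t1=9
    else     std::printf("miss\n");
    return 0;
}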
- /// - /// @note For intersection this ray is clipped to the two intersection points. - template - __hostdev__ bool clip(const BBoxT& bbox) - { - RealT t0, t1; - const bool hit = this->intersects(bbox, t0, t1); - if (hit) { - mTimeSpan.set(t0, t1); - } - return hit; - } - - /// @brief Return true if the Ray intersects the plane specified - /// by a normal and distance from the origin. - /// - /// @param normal Normal of the plane. - /// @param distance Distance of the plane to the origin. - /// @param t Time of intersection, if one exists. - __hostdev__ bool intersects(const Vec3T& normal, RealT distance, RealT& t) const - { - const RealT cosAngle = mDir.dot(normal); - if (isApproxZero(cosAngle)) { - return false; // ray is parallel to plane - } - t = (distance - mEye.dot(normal)) / cosAngle; - return this->test(t); - } - - /// @brief Return true if the Ray intersects the plane specified - /// by a normal and point. - /// - /// @param normal Normal of the plane. - /// @param point Point in the plane. - /// @param t Time of intersection, if one exists. - __hostdev__ bool intersects(const Vec3T& normal, const Vec3T& point, RealT& t) const - { - return this->intersects(normal, point.dot(normal), t); - } - -private: - Vec3T mEye, mDir, mInvDir; - TimeSpan mTimeSpan; - int mSign[3]; -}; // end of Ray class - -} // namespace nanovdb - -#endif // NANOVDB_RAY_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/Ray.h instead.") diff --git a/nanovdb/nanovdb/util/Reduce.h b/nanovdb/nanovdb/util/Reduce.h index 7073d26e05..eb0a5e749c 100644 --- a/nanovdb/nanovdb/util/Reduce.h +++ b/nanovdb/nanovdb/util/Reduce.h @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 /*! - \file Reduce.h + \file nanovdb/util/Reduce.h \author Ken Museth @@ -11,10 +11,10 @@ \brief A unified wrapper for tbb::parallel_reduce and a naive std::future analog */ -#ifndef NANOVDB_REDUCE_H_HAS_BEEN_INCLUDED -#define NANOVDB_REDUCE_H_HAS_BEEN_INCLUDED +#ifndef NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED -#include "Range.h"// for Range1D +#include // for util::Range1D #ifdef NANOVDB_USE_TBB #include @@ -26,6 +26,8 @@ namespace nanovdb { +namespace util { + /// @return reduction /// /// @param range RangeT can be Range, CoordBBox, tbb::blocked_range, blocked_range2D, or blocked_range3D. 
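[Editor's note] After this patch the reduction entry point is nanovdb::util::reduce, with the old nanovdb::reduce overloads kept as deprecated forwarders. A small usage sketch of the range overload; this assumes the patched nanovdb/util/Reduce.h and relies on util::Range1D, which the patch's own wrappers reference:

#include <nanovdb/util/Reduce.h>
#include <vector>

int main()
{
    std::vector<float> v(1000, 1.0f);
    v[123] = 42.0f;
    // max-reduction: func folds a sub-range into an accumulator, join merges partials
    const float m = nanovdb::util::reduce(
        nanovdb::util::Range1D(0, v.size(), 64), -1.0e30f,
        [&v](const nanovdb::util::Range1D& r, float a) {
            for (size_t i = r.begin(); i != r.end(); ++i) a = v[i] > a ? v[i] : a;
            return a;
        },
        [](float a, float b) { return a > b ? a : b; });
    return m == 42.0f ? 0 : 1;
}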
@@ -37,7 +39,6 @@ namespace nanovdb { /// auto func = [&array](auto &r, int a){for (auto i=r.begin(); i!=r.end(); ++i) a+=array[i]; return a;}; /// int sum = reduce(array, 0, func, [](int a, int b){return a + b;}); /// @endcode - template inline T reduce(RangeT range, const T& identity, const FuncT &func, const JoinT &join) { @@ -73,7 +74,7 @@ inline T reduce(RangeT range, const T& identity, const FuncT &func, const JoinT } /// @brief Simple wrapper to the function defined above -template +template inline T reduce(size_t begin, size_t end, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) { Range1D range(begin, end, grainSize); @@ -97,6 +98,36 @@ inline T reduce(const ContainerT &c, size_t grainSize, const T& identit return reduce( range, identity, func, join ); } +}// namespace util + +/// @brief Simple wrapper to the function defined above +template +[[deprecated("Use nanovdb::util::reduce instead")]] +inline T reduce(size_t begin, size_t end, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(begin, end, grainSize); + return util::reduce( range, identity, func, join ); +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... ArgT, typename T, typename FuncT, typename JoinT > +[[deprecated("Use nanovdb::util::reduce instead")]] +inline T reduce(const ContainerT &c, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(0, c.size(), 1); + return util::reduce( range, identity, func, join ); + +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... ArgT, typename T, typename FuncT, typename JoinT > +[[deprecated("Use nanovdb::util::reduce instead")]] +T reduce(const ContainerT &c, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(0, c.size(), grainSize); + return util::reduce( range, identity, func, join ); +} + }// namespace nanovdb -#endif // NANOVDB_REDUCE_H_HAS_BEEN_INCLUDED +#endif // NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/util/SampleFromVoxels.h b/nanovdb/nanovdb/util/SampleFromVoxels.h index e779d66cf6..b40ea82677 100644 --- a/nanovdb/nanovdb/util/SampleFromVoxels.h +++ b/nanovdb/nanovdb/util/SampleFromVoxels.h @@ -1,983 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -////////////////////////////////////////////////////////////////////////// -/// -/// @file SampleFromVoxels.h -/// -/// @brief NearestNeighborSampler, TrilinearSampler, TriquadraticSampler and TricubicSampler -/// -/// @note These interpolators employ internal caching for better performance when used repeatedly -/// in the same voxel location, so try to reuse an instance of these classes more than once. -/// -/// @warning While all the interpolators defined below work with both scalar and vector -/// values (e.g. float and Vec3) TrilinearSampler::zeroCrossing and -/// TrilinearSampler::gradient will only compile with floating point value types. 
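[Editor's note] A usage sketch for the sampler family documented above, via the createSampler factory declared below. This assumes the deprecated-but-forwarding util headers and the default createLevelSetSphere arguments (100-unit radius, voxel size 1):

#include <nanovdb/util/CreatePrimitives.h>
#include <nanovdb/util/SampleFromVoxels.h>

int main()
{
    auto handle = nanovdb::createLevelSetSphere<float>(); // default sphere
    const auto* grid = handle.grid<float>();
    auto acc = grid->getAccessor();
    auto sampler = nanovdb::createSampler<1>(acc);        // cached tri-linear sampler
    // index-space lookup well inside the sphere: expect a negative signed distance
    const float d = sampler(nanovdb::Vec3f(10.5f, 0.0f, 0.0f));
    return d < 0.0f ? 0 : 1;
}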
-/// -/// @author Ken Museth -/// -/////////////////////////////////////////////////////////////////////////// - -#ifndef NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED -#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED - -// Only define __hostdev__ when compiling as NVIDIA CUDA -#if defined(__CUDACC__) || defined(__HIP__) -#define __hostdev__ __host__ __device__ -#else -#include // for floor -#define __hostdev__ -#endif - -namespace nanovdb { - -// Forward declaration of sampler with specific polynomial orders -template -class SampleFromVoxels; - -/// @brief Factory free-function for a sampler of specific polynomial orders -/// -/// @details This allows for the compact syntax: -/// @code -/// auto acc = grid.getAccessor(); -/// auto smp = nanovdb::createSampler<1>( acc ); -/// @endcode -template -__hostdev__ SampleFromVoxels createSampler(const TreeOrAccT& acc) -{ - return SampleFromVoxels(acc); -} - -/// @brief Utility function that returns the Coord of the round-down of @a xyz -/// and redefines @a xyz as the fractional part, i.e. xyz-in = return-value + xyz-out -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz); - -/// @brief Template specialization of Floor for Vec3 -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz) -{ - const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; - xyz[0] -= ijk[0]; - xyz[1] -= ijk[1]; - xyz[2] -= ijk[2]; - return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); -} - -/// @brief Template specialization of Floor for Vec3 -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz) -{ - const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; - xyz[0] -= ijk[0]; - xyz[1] -= ijk[1]; - xyz[2] -= ijk[2]; - return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); -} - -// ------------------------------> NearestNeighborSampler <-------------------------------------- - -/// @brief Nearest neighbor, i.e. zero order, interpolator with caching -template -class SampleFromVoxels -{ -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - static const int ORDER = 0; - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : mAcc(acc) - , mPos(CoordT::max()) - { - } - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @note xyz is in index space - template - inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; - - inline __hostdev__ ValueT operator()(const CoordT& ijk) const; - -private: - const TreeOrAccT& mAcc; - mutable CoordT mPos; - mutable ValueT mVal; // private cache -}; // SampleFromVoxels - -/// @brief Nearest neighbor, i.e. 
zero order, interpolator without caching -template -class SampleFromVoxels -{ -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - static const int ORDER = 0; - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : mAcc(acc) - { - } - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @note xyz is in index space - template - inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; - - inline __hostdev__ ValueT operator()(const CoordT& ijk) const { return mAcc.getValue(ijk);} - -private: - const TreeOrAccT& mAcc; -}; // SampleFromVoxels - -template -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const -{ - const CoordT ijk = Round(xyz); - if (ijk != mPos) { - mPos = ijk; - mVal = mAcc.getValue(mPos); - } - return mVal; -} - -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const -{ - if (ijk != mPos) { - mPos = ijk; - mVal = mAcc.getValue(mPos); - } - return mVal; -} - -template -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const -{ - return mAcc.getValue(Round(xyz)); -} - -// ------------------------------> TrilinearSampler <-------------------------------------- - -/// @brief Tri-linear sampler, i.e. first order, interpolator -template -class TrilinearSampler -{ -protected: - const TreeOrAccT& mAcc; - -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - static const int ORDER = 1; - - /// @brief Protected constructor from a Tree or ReadAccessor - __hostdev__ TrilinearSampler(const TreeOrAccT& acc) : mAcc(acc) {} - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @brief Extract the stencil of 8 values - inline __hostdev__ void stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const; - - template class Vec3T> - static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]); - - template class Vec3T> - static inline __hostdev__ Vec3T gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]); - - static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[2][2][2]); -}; // TrilinearSamplerBase - -template -__hostdev__ void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const -{ - v[0][0][0] = mAcc.getValue(ijk); // i, j, k - - ijk[2] += 1; - v[0][0][1] = mAcc.getValue(ijk); // i, j, k + 1 - - ijk[1] += 1; - v[0][1][1] = mAcc.getValue(ijk); // i, j+1, k + 1 - - ijk[2] -= 1; - v[0][1][0] = mAcc.getValue(ijk); // i, j+1, k - - ijk[0] += 1; - ijk[1] -= 1; - v[1][0][0] = mAcc.getValue(ijk); // i+1, j, k - - ijk[2] += 1; - v[1][0][1] = mAcc.getValue(ijk); // i+1, j, k + 1 - - ijk[1] += 1; - v[1][1][1] = mAcc.getValue(ijk); // i+1, j+1, k + 1 - - ijk[2] -= 1; - v[1][1][0] = mAcc.getValue(ijk); // i+1, j+1, k -} - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]) -{ -#if 0 - auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a - //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b -#else - auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; -#endif - return lerp(lerp(lerp(v[0][0][0], v[0][0][1], uvw[2]), lerp(v[0][1][0], v[0][1][1], uvw[2]), uvw[1]), - lerp(lerp(v[1][0][0], 
v[1][0][1], uvw[2]), lerp(v[1][1][0], v[1][1][1], uvw[2]), uvw[1]), - uvw[0]); -} - -template -template class Vec3T> -__hostdev__ Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]) -{ - static_assert(is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type"); -#if 0 - auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a - //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b -#else - auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; -#endif - - ValueT D[4] = {v[0][0][1] - v[0][0][0], v[0][1][1] - v[0][1][0], v[1][0][1] - v[1][0][0], v[1][1][1] - v[1][1][0]}; - - // Z component - Vec3T grad(0, 0, lerp(lerp(D[0], D[1], uvw[1]), lerp(D[2], D[3], uvw[1]), uvw[0])); - - const ValueT w = ValueT(uvw[2]); - D[0] = v[0][0][0] + D[0] * w; - D[1] = v[0][1][0] + D[1] * w; - D[2] = v[1][0][0] + D[2] * w; - D[3] = v[1][1][0] + D[3] * w; - - // X component - grad[0] = lerp(D[2], D[3], uvw[1]) - lerp(D[0], D[1], uvw[1]); - - // Y component - grad[1] = lerp(D[1] - D[0], D[3] - D[2], uvw[0]); - - return grad; -} - -template -__hostdev__ bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2]) -{ - static_assert(is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); - const bool less = v[0][0][0] < ValueT(0); - return (less ^ (v[0][0][1] < ValueT(0))) || - (less ^ (v[0][1][1] < ValueT(0))) || - (less ^ (v[0][1][0] < ValueT(0))) || - (less ^ (v[1][0][0] < ValueT(0))) || - (less ^ (v[1][0][1] < ValueT(0))) || - (less ^ (v[1][1][1] < ValueT(0))) || - (less ^ (v[1][1][0] < ValueT(0))); -} - -/// @brief Template specialization that does not use caching of stencil points -template -class SampleFromVoxels : public TrilinearSampler -{ - using BaseT = TrilinearSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} - - /// @note xyz is in index space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - /// @note ijk is in index space - __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} - - /// @brief Return the gradient in index space. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ Vec3T gradient(Vec3T xyz) const; - - /// @brief Return true if the tri-linear stencil has a zero crossing at the specified index position. 
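[Editor's note] sample() above is just three nested lerps, applied along k, then j, then i. A stand-alone check that the composition reduces to the corner average at uvw = (1/2, 1/2, 1/2):

#include <cstdio>

int main()
{
    auto lerp = [](float a, float b, float w) { return a + w * (b - a); };
    float v[2][2][2] = {{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; // the 8 stencil corners
    const float u = 0.5f, w1 = 0.5f, w2 = 0.5f;              // uvw components
    const float s = lerp(lerp(lerp(v[0][0][0], v[0][0][1], w2),
                              lerp(v[0][1][0], v[0][1][1], w2), w1),
                         lerp(lerp(v[1][0][0], v[1][0][1], w2),
                              lerp(v[1][1][0], v[1][1][1], w2), w1), u);
    std::printf("%g\n", s); // average of the 8 corners: 3.5
    return 0;
}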
- /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - -}; // SampleFromVoxels - -/// @brief Template specialization with caching of stencil values -template -class SampleFromVoxels : public TrilinearSampler -{ - using BaseT = TrilinearSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - mutable CoordT mPos; - mutable ValueT mVal[2][2][2]; - - template class Vec3T> - __hostdev__ void cache(Vec3T& xyz) const; -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} - - /// @note xyz is in index space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - // @note ijk is in index space - __hostdev__ ValueT operator()(const CoordT &ijk) const; - - /// @brief Return the gradient in index space. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ Vec3T gradient(Vec3T xyz) const; - - /// @brief Return true if the tri-linear stencil has a zero crossing at the specified index position. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - - /// @brief Return true if the cached tri-linear stencil has a zero crossing. - /// - /// @warning Will only compile with floating point value types - __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } - -}; // SampleFromVoxels - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::sample(xyz, mVal); -} - -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const -{ - return ijk == mPos ? 
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, true>::gradient(Vec3T<RealT> xyz) const
-{
-    this->cache(xyz);
-    return BaseT::gradient(xyz, mVal);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ bool SampleFromVoxels<TreeOrAccT, 1, true>::zeroCrossing(Vec3T<RealT> xyz) const
-{
-    this->cache(xyz);
-    return BaseT::zeroCrossing(mVal);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ void SampleFromVoxels<TreeOrAccT, 1, true>::cache(Vec3T<RealT>& xyz) const
-{
-    CoordT ijk = Floor<CoordT>(xyz);
-    if (ijk != mPos) {
-        mPos = ijk;
-        BaseT::stencil(ijk, mVal);
-    }
-}
-
-#if 0
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
-{
-    ValueT val[2][2][2];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, val);
-    return BaseT::sample(xyz, val);
-}
-
-#else
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
-{
-    auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); };
-
-    CoordT coord = Floor<CoordT>(xyz);
-
-    ValueT vx, vx1, vy, vy1, vz, vz1;
-
-    vz = BaseT::mAcc.getValue(coord);
-    coord[2] += 1;
-    vz1 = BaseT::mAcc.getValue(coord);
-    vy = lerp(vz, vz1, xyz[2]);
-
-    coord[1] += 1;
-
-    vz1 = BaseT::mAcc.getValue(coord);
-    coord[2] -= 1;
-    vz = BaseT::mAcc.getValue(coord);
-    vy1 = lerp(vz, vz1, xyz[2]);
-
-    vx = lerp(vy, vy1, xyz[1]);
-
-    coord[0] += 1;
-
-    vz = BaseT::mAcc.getValue(coord);
-    coord[2] += 1;
-    vz1 = BaseT::mAcc.getValue(coord);
-    vy1 = lerp(vz, vz1, xyz[2]);
-
-    coord[1] -= 1;
-
-    vz1 = BaseT::mAcc.getValue(coord);
-    coord[2] -= 1;
-    vz = BaseT::mAcc.getValue(coord);
-    vy = lerp(vz, vz1, xyz[2]);
-
-    vx1 = lerp(vy, vy1, xyz[1]);
-
-    return lerp(vx, vx1, xyz[0]);
-}
-#endif
-
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ inline Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, false>::gradient(Vec3T<RealT> xyz) const
-{
-    ValueT val[2][2][2];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, val);
-    return BaseT::gradient(xyz, val);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ bool SampleFromVoxels<TreeOrAccT, 1, false>::zeroCrossing(Vec3T<RealT> xyz) const
-{
-    ValueT val[2][2][2];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, val);
-    return BaseT::zeroCrossing(val);
-}
-
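-// Usage sketch (illustrative only; `acc` is a hypothetical accessor of type AccT):
-// the cached specialization pays off when consecutive lookups land in the same voxel.
-//
-//   SampleFromVoxels<AccT, 1, true> sampler(acc);
-//   auto a = sampler(Vec3d(1.2, 3.4, 5.6)); // fills the 8-value stencil cache
-//   auto b = sampler(Vec3d(1.7, 3.1, 5.9)); // same voxel (1,3,5) -> reuses the cache
-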
-// ------------------------------> TriquadraticSampler <--------------------------------------
-
-/// @brief Tri-quadratic sampler, i.e. a second-order interpolator
-template<typename TreeOrAccT>
-class TriquadraticSampler
-{
-protected:
-    const TreeOrAccT& mAcc;
-
-public:
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-    static const int ORDER = 2;
-
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ TriquadraticSampler(const TreeOrAccT& acc) : mAcc(acc) {}
-
-    __hostdev__ const TreeOrAccT& accessor() const { return mAcc; }
-
-    /// @brief Extract the stencil of 27 values
-    inline __hostdev__ void stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const;
-
-    template<typename RealT, template<typename...> class Vec3T>
-    static inline __hostdev__ ValueT sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3]);
-
-    static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[3][3][3]);
-}; // TriquadraticSampler
-
-template<typename TreeOrAccT>
-__hostdev__ void TriquadraticSampler<TreeOrAccT>::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
-{
-    CoordT p(ijk[0] - 1, 0, 0);
-    for (int dx = 0; dx < 3; ++dx, ++p[0]) {
-        p[1] = ijk[1] - 1;
-        for (int dy = 0; dy < 3; ++dy, ++p[1]) {
-            p[2] = ijk[2] - 1;
-            for (int dz = 0; dz < 3; ++dz, ++p[2]) {
-                v[dx][dy][dz] = mAcc.getValue(p);// extract the stencil of 27 values
-            }
-        }
-    }
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3])
-{
-    auto kernel = [](const ValueT* value, double weight)->ValueT {
-        return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) +
-               0.5f * (value[2] - value[0])) + value[1];
-    };
-
-    ValueT vx[3];
-    for (int dx = 0; dx < 3; ++dx) {
-        ValueT vy[3];
-        for (int dy = 0; dy < 3; ++dy) {
-            vy[dy] = kernel(&v[dx][dy][0], uvw[2]);
-        }//loop over y
-        vx[dx] = kernel(vy, uvw[1]);
-    }//loop over x
-    return kernel(vx, uvw[0]);
-}
-
-template<typename TreeOrAccT>
-__hostdev__ bool TriquadraticSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[3][3][3])
-{
-    static_assert(is_floating_point<ValueT>::value, "TriquadraticSampler::zeroCrossing requires a floating-point type");
-    const bool less = v[0][0][0] < ValueT(0);
-    for (int dx = 0; dx < 3; ++dx) {
-        for (int dy = 0; dy < 3; ++dy) {
-            for (int dz = 0; dz < 3; ++dz) {
-                if (less ^ (v[dx][dy][dz] < ValueT(0))) return true;
-            }
-        }
-    }
-    return false;
-}
-
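-// For clarity: with w in [0,1], the 1D quadratic kernel used by sample() above
-// expands to
-//   k(w) = v[1] + (w/2)*(v[2] - v[0]) + (w*w/2)*(v[0] - 2*v[1] + v[2]),
-// i.e. the parabola through the three samples at w = -1, 0, 1; applying it
-// along z, then y, then x yields the tri-quadratic interpolant.
-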
-/// @brief Template specialization that does not use caching of stencil points
-template<typename TreeOrAccT>
-class SampleFromVoxels<TreeOrAccT, 2, false> : public TriquadraticSampler<TreeOrAccT>
-{
-    using BaseT = TriquadraticSampler<TreeOrAccT>;
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-public:
-
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {}
-
-    /// @note xyz is in index space
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
-
-    __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
-
-    /// @brief Return true if the tri-quadratic stencil has a zero crossing at the specified index position.
-    ///
-    /// @warning Will only compile with floating point value types
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ bool zeroCrossing(Vec3T<RealT> xyz) const;
-
-}; // SampleFromVoxels<TreeOrAccT, 2, false>
-
-/// @brief Template specialization with caching of stencil values
-template<typename TreeOrAccT>
-class SampleFromVoxels<TreeOrAccT, 2, true> : public TriquadraticSampler<TreeOrAccT>
-{
-    using BaseT = TriquadraticSampler<TreeOrAccT>;
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-
-    mutable CoordT mPos;
-    mutable ValueT mVal[3][3][3];
-
-    template<typename RealT, template<typename...> class Vec3T>
-    __hostdev__ void cache(Vec3T<RealT>& xyz) const;
-public:
-
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){}
-
-    /// @note xyz is in index space
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
-
-    inline __hostdev__ ValueT operator()(const CoordT &ijk) const;
-
-    /// @brief Return true if the tri-quadratic stencil has a zero crossing at the specified index position.
-    ///
-    /// @warning Will only compile with floating point value types
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ bool zeroCrossing(Vec3T<RealT> xyz) const;
-
-    /// @brief Return true if the cached tri-quadratic stencil has a zero crossing.
-    ///
-    /// @warning Will only compile with floating point value types
-    __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); }
-
-}; // SampleFromVoxels<TreeOrAccT, 2, true>
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(Vec3T<RealT> xyz) const
-{
-    this->cache(xyz);
-    return BaseT::sample(xyz, mVal);
-}
-
-template<typename TreeOrAccT>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(const CoordT &ijk) const
-{
-    return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ bool SampleFromVoxels<TreeOrAccT, 2, true>::zeroCrossing(Vec3T<RealT> xyz) const
-{
-    this->cache(xyz);
-    return BaseT::zeroCrossing(mVal);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ void SampleFromVoxels<TreeOrAccT, 2, true>::cache(Vec3T<RealT>& xyz) const
-{
-    CoordT ijk = Floor<CoordT>(xyz);
-    if (ijk != mPos) {
-        mPos = ijk;
-        BaseT::stencil(ijk, mVal);
-    }
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, false>::operator()(Vec3T<RealT> xyz) const
-{
-    ValueT val[3][3][3];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, val);
-    return BaseT::sample(xyz, val);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ bool SampleFromVoxels<TreeOrAccT, 2, false>::zeroCrossing(Vec3T<RealT> xyz) const
-{
-    ValueT val[3][3][3];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, val);
-    return BaseT::zeroCrossing(val);
-}
-
-// ------------------------------> TricubicSampler <--------------------------------------
-
-/// @brief Tri-cubic sampler, i.e. a third-order interpolator.
-///
-/// @details See the following paper for implementation details:
-/// Lekien, F. and Marsden, J.: Tricubic interpolation in three dimensions.
-/// In: International Journal for Numerical Methods
-/// in Engineering (2005), No. 63, p. 455-471
-
-template<typename TreeOrAccT>
-class TricubicSampler
-{
-protected:
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-
-    const TreeOrAccT& mAcc;
-
-public:
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ TricubicSampler(const TreeOrAccT& acc)
-        : mAcc(acc)
-    {
-    }
-
-    __hostdev__ const TreeOrAccT& accessor() const { return mAcc; }
-
-    /// @brief Extract the stencil of 64 values
-    inline __hostdev__ void stencil(const CoordT& ijk, ValueT (&c)[64]) const;
-
-    template<typename RealT, template<typename...> class Vec3T>
-    static inline __hostdev__ ValueT sample(const Vec3T<RealT> &uvw, const ValueT (&c)[64]);
-}; // TricubicSampler
-
-template<typename TreeOrAccT>
-__hostdev__ void TricubicSampler<TreeOrAccT>::stencil(const CoordT& ijk, ValueT (&C)[64]) const
-{
-    auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; };
-
-    // fetch 64 point stencil values
-    for (int i = -1; i < 3; ++i) {
-        for (int j = -1; j < 3; ++j) {
-            fetch(i, j, -1) = mAcc.getValue(ijk + CoordT(i, j, -1));
-            fetch(i, j,  0) = mAcc.getValue(ijk + CoordT(i, j,  0));
-            fetch(i, j,  1) = mAcc.getValue(ijk + CoordT(i, j,  1));
-            fetch(i, j,  2) = mAcc.getValue(ijk + CoordT(i, j,  2));
-        }
-    }
-    const ValueT half(0.5), quarter(0.25), eighth(0.125);
-    const ValueT X[64] = {// values of f(x,y,z) at the 8 corners (each from 1 stencil value).
-        fetch(0, 0, 0),
-        fetch(1, 0, 0),
-        fetch(0, 1, 0),
-        fetch(1, 1, 0),
-        fetch(0, 0, 1),
-        fetch(1, 0, 1),
-        fetch(0, 1, 1),
-        fetch(1, 1, 1),
-        // values of df/dx at the 8 corners (each from 2 stencil values).
-        half * (fetch(1, 0, 0) - fetch(-1, 0, 0)),
-        half * (fetch(2, 0, 0) - fetch(0, 0, 0)),
-        half * (fetch(1, 1, 0) - fetch(-1, 1, 0)),
-        half * (fetch(2, 1, 0) - fetch(0, 1, 0)),
-        half * (fetch(1, 0, 1) - fetch(-1, 0, 1)),
-        half * (fetch(2, 0, 1) - fetch(0, 0, 1)),
-        half * (fetch(1, 1, 1) - fetch(-1, 1, 1)),
-        half * (fetch(2, 1, 1) - fetch(0, 1, 1)),
-        // values of df/dy at the 8 corners (each from 2 stencil values).
-        half * (fetch(0, 1, 0) - fetch(0, -1, 0)),
-        half * (fetch(1, 1, 0) - fetch(1, -1, 0)),
-        half * (fetch(0, 2, 0) - fetch(0, 0, 0)),
-        half * (fetch(1, 2, 0) - fetch(1, 0, 0)),
-        half * (fetch(0, 1, 1) - fetch(0, -1, 1)),
-        half * (fetch(1, 1, 1) - fetch(1, -1, 1)),
-        half * (fetch(0, 2, 1) - fetch(0, 0, 1)),
-        half * (fetch(1, 2, 1) - fetch(1, 0, 1)),
-        // values of df/dz at the 8 corners (each from 2 stencil values).
-        half * (fetch(0, 0, 1) - fetch(0, 0, -1)),
-        half * (fetch(1, 0, 1) - fetch(1, 0, -1)),
-        half * (fetch(0, 1, 1) - fetch(0, 1, -1)),
-        half * (fetch(1, 1, 1) - fetch(1, 1, -1)),
-        half * (fetch(0, 0, 2) - fetch(0, 0, 0)),
-        half * (fetch(1, 0, 2) - fetch(1, 0, 0)),
-        half * (fetch(0, 1, 2) - fetch(0, 1, 0)),
-        half * (fetch(1, 1, 2) - fetch(1, 1, 0)),
-        // values of d2f/dxdy at the 8 corners (each from 4 stencil values).
- quarter * (fetch(1, 1, 0) - fetch(-1, 1, 0) - fetch(1, -1, 0) + fetch(-1, -1, 0)), - quarter * (fetch(2, 1, 0) - fetch(0, 1, 0) - fetch(2, -1, 0) + fetch(0, -1, 0)), - quarter * (fetch(1, 2, 0) - fetch(-1, 2, 0) - fetch(1, 0, 0) + fetch(-1, 0, 0)), - quarter * (fetch(2, 2, 0) - fetch(0, 2, 0) - fetch(2, 0, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1)), - quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1)), - quarter * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1)), - quarter * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1)), - // values of d2f/dxdz at the 8 corners (each from 4 stencil values). - quarter * (fetch(1, 0, 1) - fetch(-1, 0, 1) - fetch(1, 0, -1) + fetch(-1, 0, -1)), - quarter * (fetch(2, 0, 1) - fetch(0, 0, 1) - fetch(2, 0, -1) + fetch(0, 0, -1)), - quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1)), - quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1)), - quarter * (fetch(1, 0, 2) - fetch(-1, 0, 2) - fetch(1, 0, 0) + fetch(-1, 0, 0)), - quarter * (fetch(2, 0, 2) - fetch(0, 0, 2) - fetch(2, 0, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0)), - quarter * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0)), - // values of d2f/dydz at the 8 corners (each from 4 stencil values). - quarter * (fetch(0, 1, 1) - fetch(0, -1, 1) - fetch(0, 1, -1) + fetch(0, -1, -1)), - quarter * (fetch(1, 1, 1) - fetch(1, -1, 1) - fetch(1, 1, -1) + fetch(1, -1, -1)), - quarter * (fetch(0, 2, 1) - fetch(0, 0, 1) - fetch(0, 2, -1) + fetch(0, 0, -1)), - quarter * (fetch(1, 2, 1) - fetch(1, 0, 1) - fetch(1, 2, -1) + fetch(1, 0, -1)), - quarter * (fetch(0, 1, 2) - fetch(0, -1, 2) - fetch(0, 1, 0) + fetch(0, -1, 0)), - quarter * (fetch(1, 1, 2) - fetch(1, -1, 2) - fetch(1, 1, 0) + fetch(1, -1, 0)), - quarter * (fetch(0, 2, 2) - fetch(0, 0, 2) - fetch(0, 2, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 2, 2) - fetch(1, 0, 2) - fetch(1, 2, 0) + fetch(1, 0, 0)), - // values of d3f/dxdydz at the 8 corners (each from 8 stencil values). 
- eighth * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1) + fetch(1, -1, -1) - fetch(-1, -1, -1)), - eighth * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1) + fetch(2, -1, -1) - fetch(0, -1, -1)), - eighth * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1) - fetch(1, 2, -1) + fetch(-1, 2, -1) + fetch(1, 0, -1) - fetch(-1, 0, -1)), - eighth * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1) - fetch(2, 2, -1) + fetch(0, 2, -1) + fetch(2, 0, -1) - fetch(0, 0, -1)), - eighth * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, -1, 2) + fetch(-1, -1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0) + fetch(1, -1, 0) - fetch(-1, -1, 0)), - eighth * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, -1, 2) + fetch(0, -1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0) + fetch(2, -1, 0) - fetch(0, -1, 0)), - eighth * (fetch(1, 2, 2) - fetch(-1, 2, 2) - fetch(1, 0, 2) + fetch(-1, 0, 2) - fetch(1, 2, 0) + fetch(-1, 2, 0) + fetch(1, 0, 0) - fetch(-1, 0, 0)), - eighth * (fetch(2, 2, 2) - fetch(0, 2, 2) - fetch(2, 0, 2) + fetch(0, 0, 2) - fetch(2, 2, 0) + fetch(0, 2, 0) + fetch(2, 0, 0) - fetch(0, 0, 0))}; - - // 4Kb of static table (int8_t has a range of -127 -> 127 which suffices) - static const int8_t A[64][64] = { - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {9, -9, -9, 9, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 6, -6, 0, 0, 0, 0, 
-3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 6, -6, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {4, -4, -4, 4, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, - {-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {9, -9, 0, 0, -9, 9, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 0, 0, 6, -6, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, 0, 0, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0}, - {9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, - {-27, 27, 27, -27, 27, -27, -27, 27, -18, -9, 18, 9, 18, 9, -18, -9, -18, 18, -9, 9, 18, -18, 9, -9, -18, 18, 18, -18, -9, 9, 9, -9, -12, -6, -6, -3, 12, 6, 6, 3, -12, -6, 12, 6, -6, -3, 6, 3, -12, 12, -6, 6, -6, 6, -3, 3, -8, -4, -4, -2, -4, -2, -2, -1}, - {18, -18, -18, 18, -18, 18, 18, -18, 9, 9, -9, -9, -9, -9, 9, 9, 12, -12, 6, -6, -12, 12, -6, 6, 12, -12, -12, 12, 6, -6, -6, 6, 6, 6, 3, 3, -6, -6, -3, -3, 6, 6, -6, -6, 3, 3, -3, -3, 8, -8, 4, -4, 4, -4, 2, -2, 4, 4, 2, 2, 2, 2, 1, 1}, - {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0}, - {18, -18, -18, 
18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 9, -9, 9, -9, -9, 9, -9, 9, 12, -12, -12, 12, 6, -6, -6, 6, 6, 3, 6, 3, -6, -3, -6, -3, 8, 4, -8, -4, 4, 2, -4, -2, 6, -6, 6, -6, 3, -3, 3, -3, 4, 2, 4, 2, 2, 1, 2, 1}, - {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -6, 6, -6, 6, 6, -6, 6, -6, -8, 8, 8, -8, -4, 4, 4, -4, -3, -3, -3, -3, 3, 3, 3, 3, -4, -4, 4, 4, -2, -2, 2, 2, -4, 4, -4, 4, -2, 2, -2, 2, -2, -2, -2, -2, -1, -1, -1, -1}, - {2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 0, 0, 6, -6, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {4, -4, 0, 0, -4, 4, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, 0, 0, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, - {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0}, - {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 12, -12, 6, -6, -12, 12, -6, 6, 9, -9, -9, 9, 9, -9, -9, 9, 8, 4, 4, 2, -8, -4, -4, -2, 6, 3, -6, -3, 6, 3, -6, -3, 6, -6, 3, -3, 6, -6, 3, -3, 4, 2, 2, 1, 4, 2, 2, 1}, - {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -8, 8, -4, 4, 8, -8, 4, -4, -6, 6, 6, -6, -6, 6, 6, -6, -4, -4, -2, -2, 4, 4, 2, 2, -3, -3, 3, 3, -3, -3, 3, 3, -4, 4, -2, 2, -4, 4, -2, 2, -2, -2, -1, -1, -2, -2, -1, -1}, - {4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, - {-12, 12, 12, -12, 12, -12, -12, 12, -8, -4, 8, 4, 8, 4, -8, -4, -6, 6, -6, 6, 6, -6, 6, -6, -6, 6, 6, -6, -6, 6, 6, -6, -4, -2, -4, -2, 4, 2, 4, 2, -4, -2, 4, 2, -4, -2, 4, 2, -3, 3, -3, 
3, -3, 3, -3, -2, -1, -2, -1, -2, -1, -2, -1},
-        {8, -8, -8, 8, -8, 8, 8, -8, 4, 4, -4, -4, -4, -4, 4, 4, 4, -4, 4, -4, -4, 4, -4, 4, 4, -4, -4, 4, 4, -4, -4, 4, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, -2, 2, -2, 2, -2, 2, -2, 1, 1, 1, 1, 1, 1, 1, 1}};
-
-    for (int i = 0; i < 64; ++i) { // C = A * X
-        C[i] = ValueT(0);
-#if 0
-        for (int j = 0; j < 64; j += 4) {
-            C[i] = fma(A[i][j], X[j], fma(A[i][j+1], X[j+1], fma(A[i][j+2], X[j+2], fma(A[i][j+3], X[j+3], C[i]))));
-        }
-#else
-        for (int j = 0; j < 64; j += 4) {
-            C[i] += A[i][j] * X[j] + A[i][j + 1] * X[j + 1] + A[i][j + 2] * X[j + 2] + A[i][j + 3] * X[j + 3];
-        }
-#endif
-    }
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType TricubicSampler<TreeOrAccT>::sample(const Vec3T<RealT> &xyz, const ValueT (&C)[64])
-{
-    ValueT zPow(1), sum(0);
-    for (int k = 0, n = 0; k < 4; ++k) {
-        ValueT yPow(1);
-        for (int j = 0; j < 4; ++j, n += 4) {
-#if 0
-            sum = fma(yPow, zPow * fma(xyz[0], fma(xyz[0], fma(xyz[0], C[n + 3], C[n + 2]), C[n + 1]), C[n]), sum);
-#else
-            sum += yPow * zPow * (C[n] + xyz[0] * (C[n + 1] + xyz[0] * (C[n + 2] + xyz[0] * C[n + 3])));
-#endif
-            yPow *= xyz[1];
-        }
-        zPow *= xyz[2];
-    }
-    return sum;
-}
-
-template<typename TreeOrAccT>
-class SampleFromVoxels<TreeOrAccT, 3, true> : public TricubicSampler<TreeOrAccT>
-{
-    using BaseT = TricubicSampler<TreeOrAccT>;
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-
-    mutable CoordT mPos;
-    mutable ValueT mC[64];
-
-    template<typename RealT, template<typename...> class Vec3T>
-    __hostdev__ void cache(Vec3T<RealT>& xyz) const;
-
-public:
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc)
-        : BaseT(acc)
-        , mPos(CoordT::max()) // invalidate the cache so the first lookup fills it
-    {
-    }
-
-    /// @note xyz is in index space
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
-
-    /// @brief Return value at the coordinate @a ijk in index space
-    __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
-
-}; // SampleFromVoxels<TreeOrAccT, 3, true>
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
-{
-    this->cache(xyz);
-    return BaseT::sample(xyz, mC);
-}
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
-{
-    CoordT ijk = Floor<CoordT>(xyz);
-    if (ijk != mPos) {
-        mPos = ijk;
-        BaseT::stencil(ijk, mC);
-    }
-}
-
-template<typename TreeOrAccT>
-class SampleFromVoxels<TreeOrAccT, 3, false> : public TricubicSampler<TreeOrAccT>
-{
-    using BaseT = TricubicSampler<TreeOrAccT>;
-    using ValueT = typename TreeOrAccT::ValueType;
-    using CoordT = typename TreeOrAccT::CoordType;
-
-public:
-    /// @brief Construction from a Tree or ReadAccessor
-    __hostdev__ SampleFromVoxels(const TreeOrAccT& acc)
-        : BaseT(acc)
-    {
-    }
-
-    /// @note xyz is in index space
-    template<typename RealT, template<typename...> class Vec3T>
-    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
-
-    __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
-
-}; // SampleFromVoxels<TreeOrAccT, 3, false>
-
-template<typename TreeOrAccT>
-template<typename RealT, template<typename...> class Vec3T>
-__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, false>::operator()(Vec3T<RealT> xyz) const
-{
-    ValueT C[64];
-    CoordT ijk = Floor<CoordT>(xyz);
-    BaseT::stencil(ijk, C);
-    return BaseT::sample(xyz, C);
-}
-
-} // namespace nanovdb
-
-#endif // NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/math/SampleFromVoxels.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/SampleFromVoxels.h instead.")
diff --git a/nanovdb/nanovdb/util/Stencils.h b/nanovdb/nanovdb/util/Stencils.h
index 88e943f4ff..c93b4a15cf 100644
--- a/nanovdb/nanovdb/util/Stencils.h
+++ b/nanovdb/nanovdb/util/Stencils.h
@@ -1,1028 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-//
-/// @author Ken Museth
-///
-/// @date April 9, 2021
-///
-/// @file Stencils.h
-///
-/// @brief Defines various finite-difference stencils that allow for the
-///        computation of gradients of order 1 to 5, mean curvatures,
-///        gaussian curvatures, principal curvatures, tri-linear interpolation,
-///        zero-crossing, laplacian, and closest point transform.
-
-#ifndef NANOVDB_STENCILS_HAS_BEEN_INCLUDED
-#define NANOVDB_STENCILS_HAS_BEEN_INCLUDED
-
-#include <nanovdb/NanoVDB.h> // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4
-
-namespace nanovdb {
-
-// ---------------------------- WENO5 ----------------------------
-
-/// @brief Implementation of nominally fifth-order finite-difference WENO
-/// @details This function returns the numerical flux. See "High Order Finite Difference and
-/// Finite Volume WENO Schemes and Discontinuous Galerkin Methods for CFD" - Chi-Wang Shu
-/// ICASE Report No 2001-11 (page 6). Also see ICASE No 97-65 for a more complete reference
-/// (Shu, 1997).
-/// Given v1 = f(x-2dx), v2 = f(x-dx), v3 = f(x), v4 = f(x+dx) and v5 = f(x+2dx),
-/// return an interpolated value f(x+dx/2) with the special property that
-/// ( f(x+dx/2) - f(x-dx/2) ) / dx = df/dx (x) + error,
-/// where the error is fifth-order in smooth regions: O(dx) <= error <= O(dx^5)
-template<typename ValueType, typename RealT = ValueType>
-__hostdev__ inline ValueType
-WENO5(const ValueType& v1,
-      const ValueType& v2,
-      const ValueType& v3,
-      const ValueType& v4,
-      const ValueType& v5,
-      RealT scale2 = 1.0)// openvdb uses scale2 = 0.01
-{
-    static const RealT C = 13.0 / 12.0;
-    // WENO is formulated for non-dimensional equations, here the optional scale2
-    // is a reference value (squared) for the function being interpolated. For
-    // example if 'v' is of order 1000, then scale2 = 10^6 is ok. But in practice
-    // leave scale2 = 1.
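-    // For reference, the A_k below are the unnormalized weights gamma_k / (beta_k + eps)^2
-    // with linear weights gamma = (0.1, 0.6, 0.3) and smoothness indicators
-    //   beta_1 = 13/12*(v1 - 2*v2 + v3)^2 + 1/4*(v1 - 4*v2 + 3*v3)^2
-    //   beta_2 = 13/12*(v2 - 2*v3 + v4)^2 + 1/4*(v2 - v4)^2
-    //   beta_3 = 13/12*(v3 - 2*v4 + v5)^2 + 1/4*(3*v3 - 4*v4 + v5)^2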
-    const RealT eps = RealT(1.0e-6) * scale2;
-    // {\tilde \omega_k} = \gamma_k / (\beta_k + \epsilon)^2 (in Shu's ICASE report)
-    const RealT A1 = RealT(0.1)/Pow2(C*Pow2(v1-2*v2+v3)+RealT(0.25)*Pow2(v1-4*v2+3*v3)+eps),
-                A2 = RealT(0.6)/Pow2(C*Pow2(v2-2*v3+v4)+RealT(0.25)*Pow2(v2-v4)+eps),
-                A3 = RealT(0.3)/Pow2(C*Pow2(v3-2*v4+v5)+RealT(0.25)*Pow2(3*v3-4*v4+v5)+eps);
-
-    return static_cast<ValueType>((A1*(2*v1 - 7*v2 + 11*v3) +
-                                   A2*(5*v3 - v2 + 2*v4) +
-                                   A3*(2*v3 + 5*v4 - v5))/(6*(A1+A2+A3)));
-}
-
-// ---------------------------- GodunovsNormSqrd ----------------------------
-
-template<typename RealT>
-__hostdev__ inline RealT
-GodunovsNormSqrd(bool isOutside,
-                 RealT dP_xm, RealT dP_xp,
-                 RealT dP_ym, RealT dP_yp,
-                 RealT dP_zm, RealT dP_zp)
-{
-    RealT dPLen2;
-    if (isOutside) { // outside
-        dPLen2  = Max(Pow2(Max(dP_xm, RealT(0))), Pow2(Min(dP_xp, RealT(0)))); // (dP/dx)2
-        dPLen2 += Max(Pow2(Max(dP_ym, RealT(0))), Pow2(Min(dP_yp, RealT(0)))); // (dP/dy)2
-        dPLen2 += Max(Pow2(Max(dP_zm, RealT(0))), Pow2(Min(dP_zp, RealT(0)))); // (dP/dz)2
-    } else { // inside
-        dPLen2  = Max(Pow2(Min(dP_xm, RealT(0))), Pow2(Max(dP_xp, RealT(0)))); // (dP/dx)2
-        dPLen2 += Max(Pow2(Min(dP_ym, RealT(0))), Pow2(Max(dP_yp, RealT(0)))); // (dP/dy)2
-        dPLen2 += Max(Pow2(Min(dP_zm, RealT(0))), Pow2(Max(dP_zp, RealT(0)))); // (dP/dz)2
-    }
-    return dPLen2; // |\nabla\phi|^2
-}
-
-template<typename RealT>
-__hostdev__ inline RealT
-GodunovsNormSqrd(bool isOutside,
-                 const Vec3<RealT>& gradient_m,
-                 const Vec3<RealT>& gradient_p)
-{
-    return GodunovsNormSqrd(isOutside,
-                            gradient_m[0], gradient_p[0],
-                            gradient_m[1], gradient_p[1],
-                            gradient_m[2], gradient_p[2]);
-}
-
-// ---------------------------- BaseStencil ----------------------------
-
-// BaseStencil uses the curiously recurring template pattern (CRTP)
-template<typename DerivedType, int SIZE, typename GridT>
-class BaseStencil
-{
-public:
-    using ValueType = typename GridT::ValueType;
-    using GridType  = GridT;
-    using TreeType  = typename GridT::TreeType;
-    using AccessorType = typename GridT::AccessorType;// ReadAccessor
-
-    /// @brief Initialize the stencil buffer with the values of voxel (i, j, k)
-    ///        and its neighbors.
-    /// @param ijk Index coordinates of stencil center
-    __hostdev__ inline void moveTo(const Coord& ijk)
-    {
-        mCenter = ijk;
-        mValues[0] = mAcc.getValue(ijk);
-        static_cast<DerivedType&>(*this).init(mCenter);
-    }
-
-    /// @brief Initialize the stencil buffer with the values of voxel (i, j, k)
-    ///        and its neighbors. The method also takes a value of the center
-    ///        element of the stencil, assuming it is already known.
-    /// @param ijk Index coordinates of stencil center
-    /// @param centerValue Value of the center element of the stencil
-    __hostdev__ inline void moveTo(const Coord& ijk, const ValueType& centerValue)
-    {
-        mCenter = ijk;
-        mValues[0] = centerValue;
-        static_cast<DerivedType&>(*this).init(mCenter);
-    }
-
-    /// @brief Initialize the stencil buffer with the values of voxel
-    ///        (x, y, z) and its neighbors.
-    ///
-    /// @note This version is slightly faster than the one above, since
-    ///       the center voxel's value is read directly from the iterator.
-    template<typename IterType>
-    __hostdev__ inline void moveTo(const IterType& iter)
-    {
-        mCenter = iter.getCoord();
-        mValues[0] = *iter;
-        static_cast<DerivedType&>(*this).init(mCenter);
-    }
-
-    /// @brief Initialize the stencil buffer with the values of voxel (x, y, z)
-    ///        and its neighbors.
-    /// @param xyz Floating point voxel coordinates of stencil center
-    /// @details This method will check to see if it is necessary to
-    ///          update the stencil based on the cached index coordinates of
-    ///          the center point.
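-    /// @par Example: a minimal usage sketch, assuming a hypothetical nanovdb::FloatGrid named grid
-    /// @code
-    /// GradStencil<nanovdb::FloatGrid> stencil(grid);
-    /// stencil.moveTo(Vec3d(1.5, 2.3, 7.9)); // rounds down to voxel (1, 2, 7)
-    /// auto grad = stencil.gradient();       // central-difference gradient at that voxel
-    /// @endcode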
-    template<typename RealT>
-    __hostdev__ inline void moveTo(const Vec3<RealT>& xyz)
-    {
-        Coord ijk = RoundDown(xyz);
-        if (ijk != mCenter) this->moveTo(ijk);
-    }
-
-    /// @brief Return the value from the stencil buffer with linear
-    ///        offset pos.
-    ///
-    /// @note The default (@a pos = 0) corresponds to the first element
-    ///       which is typically the center point of the stencil.
-    __hostdev__ inline const ValueType& getValue(unsigned int pos = 0) const
-    {
-        NANOVDB_ASSERT(pos < SIZE);
-        return mValues[pos];
-    }
-
-    /// @brief Return the value at the specified location relative to the center of the stencil
-    template<int i, int j, int k>
-    __hostdev__ inline const ValueType& getValue() const
-    {
-        return mValues[static_cast<const DerivedType&>(*this).template pos<i, j, k>()];
-    }
-
-    /// @brief Set the value at the specified location relative to the center of the stencil
-    template<int i, int j, int k>
-    __hostdev__ inline void setValue(const ValueType& value)
-    {
-        mValues[static_cast<const DerivedType&>(*this).template pos<i, j, k>()] = value;
-    }
-
-    /// @brief Return the size of the stencil buffer.
-    __hostdev__ static int size() { return SIZE; }
-
-    /// @brief Return the mean value of the current stencil.
-    __hostdev__ inline ValueType mean() const
-    {
-        ValueType sum = 0.0;
-        for (int i = 0; i < SIZE; ++i) sum += mValues[i];
-        return sum / ValueType(SIZE);
-    }
-
-    /// @brief Return the smallest value in the stencil buffer.
-    __hostdev__ inline ValueType min() const
-    {
-        ValueType v = mValues[0];
-        for (int i = 1; i < SIZE; ++i) {
-            if (mValues[i] < v) v = mValues[i];
-        }
-        return v;
-    }
-
-    /// @brief Return the largest value in the stencil buffer.
-    __hostdev__ inline ValueType max() const
-    {
-        ValueType v = mValues[0];
-        for (int i = 1; i < SIZE; ++i) {
-            if (mValues[i] > v) v = mValues[i];
-        }
-        return v;
-    }
-
-    /// @brief Return the coordinates of the center point of the stencil.
-    __hostdev__ inline const Coord& getCenterCoord() const { return mCenter; }
-
-    /// @brief Return the value at the center of the stencil
-    __hostdev__ inline const ValueType& getCenterValue() const { return mValues[0]; }
-
-    /// @brief Return true if the center of the stencil intersects the
-    ///        iso-contour specified by the isoValue
-    __hostdev__ inline bool intersects(const ValueType &isoValue = ValueType(0)) const
-    {
-        const bool less = this->getValue< 0, 0, 0>() < isoValue;
-        return (less ^ (this->getValue<-1, 0, 0>() < isoValue)) ||
-               (less ^ (this->getValue< 1, 0, 0>() < isoValue)) ||
-               (less ^ (this->getValue< 0,-1, 0>() < isoValue)) ||
-               (less ^ (this->getValue< 0, 1, 0>() < isoValue)) ||
-               (less ^ (this->getValue< 0, 0,-1>() < isoValue)) ||
-               (less ^ (this->getValue< 0, 0, 1>() < isoValue));
-    }
-
-    struct Mask {
-        uint8_t bits;
-        __hostdev__ Mask() : bits(0u) {}
-        __hostdev__ void set(int i) { bits |= (1 << i); }
-        __hostdev__ bool test(int i) const { return bits & (1 << i); }
-        __hostdev__ bool any() const { return bits > 0u; }
-        __hostdev__ bool all() const { return bits == 255u; }
-        __hostdev__ bool none() const { return bits == 0u; }
-        __hostdev__ int count() const { return CountOn(bits); }
-    };// Mask
-
-    /// @brief Return a bit-mask where the 6 lower bits indicate whether the
-    ///        center of the stencil intersects the iso-contour specified by the isoValue.
-    ///
-    /// @note There are 2^6 = 64 different possible cases, including no intersections!
-    ///
-    /// @details The ordering of the bit mask is ( -x, +x, -y, +y, -z, +z ), so to
-    ///          check if there is an intersection in -y use (mask & (1u<<2)) where mask is
-    ///          the return value from this function. To check if there are any
-    ///          intersections use mask!=0u, and for no intersections use mask==0u.
-    ///          To count the number of intersections use __builtin_popcount(mask).
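-    /// @par Example: a minimal usage sketch, assuming the stencil has already been moved to a voxel
-    /// @code
-    /// auto mask = stencil.intersectionMask();
-    /// if (mask.test(2)) { /* the iso-contour is crossed in the -y direction */ }
-    /// int n = mask.count(); // number of crossings (0 to 6)
-    /// @endcode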
-    __hostdev__ inline Mask intersectionMask(ValueType isoValue = ValueType(0)) const
-    {
-        Mask mask;
-        const bool less = this->getValue< 0, 0, 0>() < isoValue;
-        if (less ^ (this->getValue<-1, 0, 0>() < isoValue)) mask.set(0);// |= 1u;
-        if (less ^ (this->getValue< 1, 0, 0>() < isoValue)) mask.set(1);// |= 2u;
-        if (less ^ (this->getValue< 0,-1, 0>() < isoValue)) mask.set(2);// |= 4u;
-        if (less ^ (this->getValue< 0, 1, 0>() < isoValue)) mask.set(3);// |= 8u;
-        if (less ^ (this->getValue< 0, 0,-1>() < isoValue)) mask.set(4);// |= 16u;
-        if (less ^ (this->getValue< 0, 0, 1>() < isoValue)) mask.set(5);// |= 32u;
-        return mask;
-    }
-
-    /// @brief Return a const reference to the grid from which this
-    ///        stencil was constructed.
-    __hostdev__ inline const GridType& grid() const { return *mGrid; }
-
-    /// @brief Return a const reference to the ValueAccessor
-    ///        associated with this Stencil.
-    __hostdev__ inline const AccessorType& accessor() const { return mAcc; }
-
-protected:
-    // Constructor is protected to prevent direct instantiation.
-    __hostdev__ BaseStencil(const GridType& grid)
-        : mGrid(&grid)
-        , mAcc(grid)
-        , mCenter(Coord::max())
-    {
-    }
-
-    const GridType* mGrid;
-    AccessorType    mAcc;
-    ValueType       mValues[SIZE];
-    Coord           mCenter;
-
-}; // BaseStencil class
-
-
-// ---------------------------- BoxStencil ----------------------------
-
-
-namespace { // anonymous namespace for stencil-layout map
-
-    // the eight point box stencil
-    template<int i, int j, int k> struct BoxPt {};
-    template<> struct BoxPt< 0, 0, 0> { enum { idx = 0 }; };
-    template<> struct BoxPt< 0, 0, 1> { enum { idx = 1 }; };
-    template<> struct BoxPt< 0, 1, 1> { enum { idx = 2 }; };
-    template<> struct BoxPt< 0, 1, 0> { enum { idx = 3 }; };
-    template<> struct BoxPt< 1, 0, 0> { enum { idx = 4 }; };
-    template<> struct BoxPt< 1, 0, 1> { enum { idx = 5 }; };
-    template<> struct BoxPt< 1, 1, 1> { enum { idx = 6 }; };
-    template<> struct BoxPt< 1, 1, 0> { enum { idx = 7 }; };
-
-}
-
-template<typename GridT>
-class BoxStencil: public BaseStencil<BoxStencil<GridT>, 8, GridT>
-{
-    using SelfT = BoxStencil<GridT>;
-    using BaseType = BaseStencil<SelfT, 8, GridT>;
-public:
-    using GridType = GridT;
-    using TreeType = typename GridT::TreeType;
-    using ValueType = typename GridT::ValueType;
-
-    static constexpr int SIZE = 8;
-
-    __hostdev__ BoxStencil(const GridType& grid) : BaseType(grid) {}
-
-    /// Return linear offset for the specified stencil point relative to its center
-    template<int i, int j, int k>
-    __hostdev__ unsigned int pos() const { return BoxPt<i, j, k>::idx; }
-
-    /// @brief Return true if the center of the stencil intersects the
-    ///        iso-contour specified by the isoValue
-    __hostdev__ inline bool intersects(ValueType isoValue = ValueType(0)) const
-    {
-        const bool less = mValues[0] < isoValue;
-        return (less ^ (mValues[1] < isoValue)) ||
-               (less ^ (mValues[2] < isoValue)) ||
-               (less ^ (mValues[3] < isoValue)) ||
-               (less ^ (mValues[4] < isoValue)) ||
-               (less ^ (mValues[5] < isoValue)) ||
-               (less ^ (mValues[6] < isoValue)) ||
-               (less ^ (mValues[7] < isoValue));
-    }
-
-    /// @brief Return the trilinear interpolation at the normalized position.
-    /// @param xyz Floating point coordinate position. Index space and NOT world space.
-    /// @warning It is assumed that the stencil has already been moved
-    ///          to the relevant voxel position, e.g. using moveTo(xyz).
-    /// @note The trilinear interpolation kernel reads as:
-    ///       v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw
-    ///     + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw
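-    /// @par Example: a minimal usage sketch, assuming a hypothetical nanovdb::FloatGrid named grid
-    /// @code
-    /// BoxStencil<nanovdb::FloatGrid> stencil(grid);
-    /// Vec3<float> xyz(1.25f, 2.5f, 7.75f); // index-space position
-    /// stencil.moveTo(xyz);
-    /// float v = stencil.interpolation(xyz);
-    /// @endcode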
-    __hostdev__ inline ValueType interpolation(const Vec3<ValueType>& xyz) const
-    {
-        const ValueType u = xyz[0] - mCenter[0];
-        const ValueType v = xyz[1] - mCenter[1];
-        const ValueType w = xyz[2] - mCenter[2];
-
-        NANOVDB_ASSERT(u>=0 && u<=1);
-        NANOVDB_ASSERT(v>=0 && v<=1);
-        NANOVDB_ASSERT(w>=0 && w<=1);
-
-        ValueType V = BaseType::template getValue<0,0,0>();
-        ValueType A = V + (BaseType::template getValue<0,0,1>() - V) * w;
-        V = BaseType::template getValue< 0, 1, 0>();
-        ValueType B = V + (BaseType::template getValue<0,1,1>() - V) * w;
-        ValueType C = A + (B - A) * v;
-
-        V = BaseType::template getValue<1,0,0>();
-        A = V + (BaseType::template getValue<1,0,1>() - V) * w;
-        V = BaseType::template getValue<1,1,0>();
-        B = V + (BaseType::template getValue<1,1,1>() - V) * w;
-        ValueType D = A + (B - A) * v;
-
-        return C + (D - C) * u;
-    }
-
-    /// @brief Return the gradient in world space of the trilinear interpolation kernel.
-    /// @param xyz Floating point coordinate position.
-    /// @warning It is assumed that the stencil has already been moved
-    ///          to the relevant voxel position, e.g. using moveTo(xyz).
-    /// @note Computed as partial derivatives of the trilinear interpolation kernel:
-    ///       v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw
-    ///     + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw
-    __hostdev__ inline Vec3<ValueType> gradient(const Vec3<ValueType>& xyz) const
-    {
-        const ValueType u = xyz[0] - mCenter[0];
-        const ValueType v = xyz[1] - mCenter[1];
-        const ValueType w = xyz[2] - mCenter[2];
-
-        NANOVDB_ASSERT(u>=0 && u<=1);
-        NANOVDB_ASSERT(v>=0 && v<=1);
-        NANOVDB_ASSERT(w>=0 && w<=1);
-
-        ValueType D[4]={BaseType::template getValue<0,0,1>()-BaseType::template getValue<0,0,0>(),
-                        BaseType::template getValue<0,1,1>()-BaseType::template getValue<0,1,0>(),
-                        BaseType::template getValue<1,0,1>()-BaseType::template getValue<1,0,0>(),
-                        BaseType::template getValue<1,1,1>()-BaseType::template getValue<1,1,0>()};
-
-        // Z component
-        ValueType A = D[0] + (D[1]- D[0]) * v;
-        ValueType B = D[2] + (D[3]- D[2]) * v;
-        Vec3<ValueType> grad(0, 0, A + (B - A) * u);
-
-        D[0] = BaseType::template getValue<0,0,0>() + D[0] * w;
-        D[1] = BaseType::template getValue<0,1,0>() + D[1] * w;
-        D[2] = BaseType::template getValue<1,0,0>() + D[2] * w;
-        D[3] = BaseType::template getValue<1,1,0>() + D[3] * w;
-
-        // X component
-        A = D[0] + (D[1] - D[0]) * v;
-        B = D[2] + (D[3] - D[2]) * v;
-
-        grad[0] = B - A;
-
-        // Y component
-        A = D[1] - D[0];
-        B = D[3] - D[2];
-
-        grad[1] = A + (B - A) * u;
-
-        return BaseType::mGrid->map().applyIJT(grad);
-    }
-
-private:
-    __hostdev__ inline void init(const Coord& ijk)
-    {
-        mValues[ 1] = mAcc.getValue(ijk.offsetBy( 0, 0, 1));
-        mValues[ 2] = mAcc.getValue(ijk.offsetBy( 0, 1, 1));
-        mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, 1, 0));
-        mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0));
-        mValues[ 5] = mAcc.getValue(ijk.offsetBy( 1, 0, 1));
-        mValues[ 6] = mAcc.getValue(ijk.offsetBy( 1, 1, 1));
-        mValues[ 7] = mAcc.getValue(ijk.offsetBy( 1, 1, 0));
-    }
-
-    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
-    using BaseType::mAcc;
-    using BaseType::mValues;
-    using BaseType::mCenter;
-};// BoxStencil class
-
-
-// ---------------------------- GradStencil ----------------------------
-
-namespace { // anonymous namespace for stencil-layout map
-
-    template<int i, int j, int k> struct GradPt {};
-    template<> struct GradPt< 0, 0, 0> { enum { idx = 0 }; };
-    template<> struct GradPt< 1, 0, 0> { enum { idx = 2 }; };
-    template<> struct GradPt< 0, 1, 0> { enum { idx = 4 }; };
-    template<> struct GradPt< 0, 0, 1> { enum { idx = 6 }; };
-    template<> struct GradPt<-1, 0, 0> { enum { idx = 1 }; };
-    template<> struct GradPt< 0,-1, 0> { enum { idx = 3 }; };
-    template<> struct GradPt< 0, 0,-1> { enum { idx = 5 }; };
-}
-
-/// This is a simple 7-point nearest neighbor stencil that supports
-/// gradient by second-order central differencing, first-order upwinding,
-/// Laplacian, closest-point transform and zero-crossing test.
-///
-/// @note For optimal random access performance this class
-/// includes its own grid accessor.
-template<typename GridT>
-class GradStencil : public BaseStencil<GradStencil<GridT>, 7, GridT>
-{
-    using SelfT = GradStencil<GridT>;
-    using BaseType = BaseStencil<SelfT, 7, GridT>;
-public:
-    using GridType = GridT;
-    using TreeType = typename GridT::TreeType;
-    using ValueType = typename GridT::ValueType;
-
-    static constexpr int SIZE = 7;
-
-    __hostdev__ GradStencil(const GridType& grid)
-        : BaseType(grid)
-        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
-        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
-    {
-    }
-
-    __hostdev__ GradStencil(const GridType& grid, double dx)
-        : BaseType(grid)
-        , mInv2Dx(ValueType(0.5 / dx))
-        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
-    {
-    }
-
-    /// @brief Return the norm square of the single-sided upwind gradient
-    ///        (computed via Godunov's scheme) at the previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType normSqGrad() const
-    {
-        return mInvDx2 * GodunovsNormSqrd(mValues[0] > ValueType(0),
-                                          mValues[0] - mValues[1],
-                                          mValues[2] - mValues[0],
-                                          mValues[0] - mValues[3],
-                                          mValues[4] - mValues[0],
-                                          mValues[0] - mValues[5],
-                                          mValues[6] - mValues[0]);
-    }
-
-    /// @brief Return the gradient computed at the previously buffered
-    ///        location by second order central differencing.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline Vec3<ValueType> gradient() const
-    {
-        return Vec3<ValueType>(mValues[2] - mValues[1],
-                               mValues[4] - mValues[3],
-                               mValues[6] - mValues[5])*mInv2Dx;
-    }
-
-    /// @brief Return the first-order upwind gradient corresponding to the direction V.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline Vec3<ValueType> gradient(const Vec3<ValueType>& V) const
-    {
-        return Vec3<ValueType>(
-            V[0]>0 ? mValues[0] - mValues[1] : mValues[2] - mValues[0],
-            V[1]>0 ? mValues[0] - mValues[3] : mValues[4] - mValues[0],
-            V[2]>0 ? mValues[0] - mValues[5] : mValues[6] - mValues[0])*2*mInv2Dx;
-    }
-
-    /// Return the Laplacian computed at the previously buffered
-    /// location by second-order central differencing.
-    __hostdev__ inline ValueType laplacian() const
-    {
-        return mInvDx2 * (mValues[1] + mValues[2] +
-                          mValues[3] + mValues[4] +
-                          mValues[5] + mValues[6] - 6*mValues[0]);
-    }
-
-    /// Return @c true if the sign of the value at the center point of the stencil
-    /// is different from the signs of any of its six nearest neighbors.
-    __hostdev__ inline bool zeroCrossing() const
-    {
-        return (mValues[0]>0 ? (mValues[1]<0 || mValues[2]<0 || mValues[3]<0 || mValues[4]<0 || mValues[5]<0 || mValues[6]<0)
-                             : (mValues[1]>0 || mValues[2]>0 || mValues[3]>0 || mValues[4]>0 || mValues[5]>0 || mValues[6]>0));
-    }
-
-    /// @brief Compute the closest-point transform to a level set.
-    /// @return the closest point in index space to the surface
-    ///         from which the level set was derived.
-    ///
-    /// @note This method assumes that the grid represents a level set
-    ///       with distances in world units and a simple affine transform
-    ///       with uniform scaling.
-    __hostdev__ inline Vec3<ValueType> cpt()
-    {
-        const Coord& ijk = BaseType::getCenterCoord();
-        const ValueType d = ValueType(mValues[0] * 0.5 * mInvDx2); // distance in voxels / (2dx^2)
-        const auto value = Vec3<ValueType>(ijk[0] - d*(mValues[2] - mValues[1]),
-                                           ijk[1] - d*(mValues[4] - mValues[3]),
-                                           ijk[2] - d*(mValues[6] - mValues[5]));
-        return value;
-    }
-
-    /// Return linear offset for the specified stencil point relative to its center
-    template<int i, int j, int k>
-    __hostdev__ unsigned int pos() const { return GradPt<i, j, k>::idx; }
-
-private:
-
-    __hostdev__ inline void init(const Coord& ijk)
-    {
-        mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0));
-        mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0));
-
-        mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0,-1, 0));
-        mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0));
-
-        mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0,-1));
-        mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1));
-    }
-
-    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
-    using BaseType::mAcc;
-    using BaseType::mValues;
-    const ValueType mInv2Dx, mInvDx2;
-}; // GradStencil class
-
-
-// ---------------------------- WenoStencil ----------------------------
-
-namespace { // anonymous namespace for stencil-layout map
-
-    template<int i, int j, int k> struct WenoPt {};
-    template<> struct WenoPt< 0, 0, 0> { enum { idx = 0 }; };
-
-    template<> struct WenoPt<-3, 0, 0> { enum { idx = 1 }; };
-    template<> struct WenoPt<-2, 0, 0> { enum { idx = 2 }; };
-    template<> struct WenoPt<-1, 0, 0> { enum { idx = 3 }; };
-    template<> struct WenoPt< 1, 0, 0> { enum { idx = 4 }; };
-    template<> struct WenoPt< 2, 0, 0> { enum { idx = 5 }; };
-    template<> struct WenoPt< 3, 0, 0> { enum { idx = 6 }; };
-
-    template<> struct WenoPt< 0,-3, 0> { enum { idx = 7 }; };
-    template<> struct WenoPt< 0,-2, 0> { enum { idx = 8 }; };
-    template<> struct WenoPt< 0,-1, 0> { enum { idx = 9 }; };
-    template<> struct WenoPt< 0, 1, 0> { enum { idx =10 }; };
-    template<> struct WenoPt< 0, 2, 0> { enum { idx =11 }; };
-    template<> struct WenoPt< 0, 3, 0> { enum { idx =12 }; };
-
-    template<> struct WenoPt< 0, 0,-3> { enum { idx =13 }; };
-    template<> struct WenoPt< 0, 0,-2> { enum { idx =14 }; };
-    template<> struct WenoPt< 0, 0,-1> { enum { idx =15 }; };
-    template<> struct WenoPt< 0, 0, 1> { enum { idx =16 }; };
-    template<> struct WenoPt< 0, 0, 2> { enum { idx =17 }; };
-    template<> struct WenoPt< 0, 0, 3> { enum { idx =18 }; };
-
-}
-
-/// @brief This is a special 19-point stencil that supports optimal fifth-order WENO
-///        upwinding, second-order central differencing, Laplacian, and zero-crossing test.
-///
-/// @note For optimal random access performance this class
-///       includes its own grid accessor.
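-/// @par Example: a minimal usage sketch, assuming a hypothetical level-set
-///      nanovdb::FloatGrid named grid and integer coordinates i, j, k
-/// @code
-/// WenoStencil<nanovdb::FloatGrid> stencil(grid);
-/// stencil.moveTo(Coord(i, j, k));
-/// float n2 = stencil.normSqGrad(); // |grad(phi)|^2 via WENO5 upwinding + Godunov's scheme
-/// @endcode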
-template<typename GridT, typename RealT = typename GridT::ValueType>
-class WenoStencil: public BaseStencil<WenoStencil<GridT, RealT>, 19, GridT>
-{
-    using SelfT = WenoStencil<GridT, RealT>;
-    using BaseType = BaseStencil<SelfT, 19, GridT>;
-public:
-    using GridType = GridT;
-    using TreeType = typename GridT::TreeType;
-    using ValueType = typename GridT::ValueType;
-
-    static constexpr int SIZE = 19;
-
-    __hostdev__ WenoStencil(const GridType& grid)
-        : BaseType(grid)
-        , mDx2(ValueType(Pow2(grid.voxelSize()[0])))
-        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
-        , mInvDx2(ValueType(1.0 / mDx2))
-    {
-    }
-
-    __hostdev__ WenoStencil(const GridType& grid, double dx)
-        : BaseType(grid)
-        , mDx2(ValueType(dx * dx))
-        , mInv2Dx(ValueType(0.5 / dx))
-        , mInvDx2(ValueType(1.0 / mDx2))
-    {
-    }
-
-    /// @brief Return the norm-square of the WENO upwind gradient (computed via
-    ///        WENO upwinding and Godunov's scheme) at the previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType normSqGrad(ValueType isoValue = ValueType(0)) const
-    {
-        const ValueType* v = mValues;
-        const RealT
-            dP_xm = WENO5<RealT>(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2),
-            dP_xp = WENO5<RealT>(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2),
-            dP_ym = WENO5<RealT>(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2),
-            dP_yp = WENO5<RealT>(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2),
-            dP_zm = WENO5<RealT>(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2),
-            dP_zp = WENO5<RealT>(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2);
-        return mInvDx2 * static_cast<ValueType>(
-            GodunovsNormSqrd(v[0] > isoValue, dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp));
-    }
-
-    /// Return the optimal fifth-order upwind gradient corresponding to the
-    /// direction V.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline Vec3<ValueType> gradient(const Vec3<ValueType>& V) const
-    {
-        const ValueType* v = mValues;
-        return 2*mInv2Dx * Vec3<ValueType>(
-            V[0]>0 ? WENO5<RealT>(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2)
-                   : WENO5<RealT>(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2),
-            V[1]>0 ? WENO5<RealT>(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2)
-                   : WENO5<RealT>(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2),
-            V[2]>0 ? WENO5<RealT>(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2)
-                   : WENO5<RealT>(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2));
-    }
-
-    /// Return the gradient computed at the previously buffered
-    /// location by second-order central differencing.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline Vec3<ValueType> gradient() const
-    {
-        return mInv2Dx * Vec3<ValueType>(mValues[ 4] - mValues[ 3],
-                                         mValues[10] - mValues[ 9],
-                                         mValues[16] - mValues[15]);
-    }
-
-    /// Return the Laplacian computed at the previously buffered
-    /// location by second-order central differencing.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType laplacian() const
-    {
-        return mInvDx2 * (
-            mValues[ 3] + mValues[ 4] +
-            mValues[ 9] + mValues[10] +
-            mValues[15] + mValues[16] - 6*mValues[0]);
-    }
-
-    /// Return @c true if the sign of the value at the center point of the stencil
-    /// differs from the sign of any of its six nearest neighbors
-    __hostdev__ inline bool zeroCrossing() const
-    {
-        const ValueType* v = mValues;
-        return (v[ 0]>0 ? (v[ 3]<0 || v[ 4]<0 || v[ 9]<0 || v[10]<0 || v[15]<0 || v[16]<0)
-                        : (v[ 3]>0 || v[ 4]>0 || v[ 9]>0 || v[10]>0 || v[15]>0 || v[16]>0));
-    }
-
-    /// Return linear offset for the specified stencil point relative to its center
-    template<int i, int j, int k>
-    __hostdev__ unsigned int pos() const { return WenoPt<i, j, k>::idx; }
-
-private:
-    __hostdev__ inline void init(const Coord& ijk)
-    {
-        mValues[ 1] = mAcc.getValue(ijk.offsetBy(-3, 0, 0));
-        mValues[ 2] = mAcc.getValue(ijk.offsetBy(-2, 0, 0));
-        mValues[ 3] = mAcc.getValue(ijk.offsetBy(-1, 0, 0));
-        mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0));
-        mValues[ 5] = mAcc.getValue(ijk.offsetBy( 2, 0, 0));
-        mValues[ 6] = mAcc.getValue(ijk.offsetBy( 3, 0, 0));
-
-        mValues[ 7] = mAcc.getValue(ijk.offsetBy( 0, -3, 0));
-        mValues[ 8] = mAcc.getValue(ijk.offsetBy( 0, -2, 0));
-        mValues[ 9] = mAcc.getValue(ijk.offsetBy( 0, -1, 0));
-        mValues[10] = mAcc.getValue(ijk.offsetBy( 0, 1, 0));
-        mValues[11] = mAcc.getValue(ijk.offsetBy( 0, 2, 0));
-        mValues[12] = mAcc.getValue(ijk.offsetBy( 0, 3, 0));
-
-        mValues[13] = mAcc.getValue(ijk.offsetBy( 0, 0, -3));
-        mValues[14] = mAcc.getValue(ijk.offsetBy( 0, 0, -2));
-        mValues[15] = mAcc.getValue(ijk.offsetBy( 0, 0, -1));
-        mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 0, 1));
-        mValues[17] = mAcc.getValue(ijk.offsetBy( 0, 0, 2));
-        mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 0, 3));
-    }
-
-    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
-    using BaseType::mAcc;
-    using BaseType::mValues;
-    const ValueType mDx2, mInv2Dx, mInvDx2;
-}; // WenoStencil class
-
-
-// ---------------------------- CurvatureStencil ----------------------------
-
-namespace { // anonymous namespace for stencil-layout map
-
-    template<int i, int j, int k> struct CurvPt {};
-    template<> struct CurvPt< 0, 0, 0> { enum { idx = 0 }; };
-
-    template<> struct CurvPt<-1, 0, 0> { enum { idx = 1 }; };
-    template<> struct CurvPt< 1, 0, 0> { enum { idx = 2 }; };
-
-    template<> struct CurvPt< 0,-1, 0> { enum { idx = 3 }; };
-    template<> struct CurvPt< 0, 1, 0> { enum { idx = 4 }; };
-
-    template<> struct CurvPt< 0, 0,-1> { enum { idx = 5 }; };
-    template<> struct CurvPt< 0, 0, 1> { enum { idx = 6 }; };
-
-    template<> struct CurvPt<-1,-1, 0> { enum { idx = 7 }; };
-    template<> struct CurvPt< 1,-1, 0> { enum { idx = 8 }; };
-    template<> struct CurvPt<-1, 1, 0> { enum { idx = 9 }; };
-    template<> struct CurvPt< 1, 1, 0> { enum { idx =10 }; };
-
-    template<> struct CurvPt<-1, 0,-1> { enum { idx =11 }; };
-    template<> struct CurvPt< 1, 0,-1> { enum { idx =12 }; };
-    template<> struct CurvPt<-1, 0, 1> { enum { idx =13 }; };
-    template<> struct CurvPt< 1, 0, 1> { enum { idx =14 }; };
-
-    template<> struct CurvPt< 0,-1,-1> { enum { idx =15 }; };
-    template<> struct CurvPt< 0, 1,-1> { enum { idx =16 }; };
-    template<> struct CurvPt< 0,-1, 1> { enum { idx =17 }; };
-    template<> struct CurvPt< 0, 1, 1> { enum { idx =18 }; };
-
-}
-
-template<typename GridT, typename RealT = typename GridT::ValueType>
-class CurvatureStencil: public BaseStencil<CurvatureStencil<GridT, RealT>, 19, GridT>
-{
-    using SelfT = CurvatureStencil<GridT, RealT>;
-    using BaseType = BaseStencil<SelfT, 19, GridT>;
-public:
-    using GridType = GridT;
-    using TreeType = typename GridT::TreeType;
-
-
-// ---------------------------- CurvatureStencil ----------------------------
-
-namespace { // anonymous namespace for stencil-layout map
-
-    template<int i, int j, int k> struct CurvPt {};
-    template<> struct CurvPt< 0, 0, 0> { enum { idx = 0 }; };
-
-    template<> struct CurvPt<-1, 0, 0> { enum { idx = 1 }; };
-    template<> struct CurvPt< 1, 0, 0> { enum { idx = 2 }; };
-
-    template<> struct CurvPt< 0,-1, 0> { enum { idx = 3 }; };
-    template<> struct CurvPt< 0, 1, 0> { enum { idx = 4 }; };
-
-    template<> struct CurvPt< 0, 0,-1> { enum { idx = 5 }; };
-    template<> struct CurvPt< 0, 0, 1> { enum { idx = 6 }; };
-
-    template<> struct CurvPt<-1,-1, 0> { enum { idx = 7 }; };
-    template<> struct CurvPt< 1,-1, 0> { enum { idx = 8 }; };
-    template<> struct CurvPt<-1, 1, 0> { enum { idx = 9 }; };
-    template<> struct CurvPt< 1, 1, 0> { enum { idx =10 }; };
-
-    template<> struct CurvPt<-1, 0,-1> { enum { idx =11 }; };
-    template<> struct CurvPt< 1, 0,-1> { enum { idx =12 }; };
-    template<> struct CurvPt<-1, 0, 1> { enum { idx =13 }; };
-    template<> struct CurvPt< 1, 0, 1> { enum { idx =14 }; };
-
-    template<> struct CurvPt< 0,-1,-1> { enum { idx =15 }; };
-    template<> struct CurvPt< 0, 1,-1> { enum { idx =16 }; };
-    template<> struct CurvPt< 0,-1, 1> { enum { idx =17 }; };
-    template<> struct CurvPt< 0, 1, 1> { enum { idx =18 }; };
-
-}
-
-template<typename GridT, typename RealT = typename GridT::ValueType>
-class CurvatureStencil: public BaseStencil<CurvatureStencil<GridT, RealT>, 19, GridT>
-{
-    using SelfT = CurvatureStencil<GridT, RealT>;
-    using BaseType = BaseStencil<SelfT, 19, GridT>;
-public:
-    using GridType = GridT;
-    using TreeType = typename GridT::TreeType;
-    using ValueType = typename GridT::ValueType;
-
-    static constexpr int SIZE = 19;
-
-    __hostdev__ CurvatureStencil(const GridType& grid)
-        : BaseType(grid)
-        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
-        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
-    {
-    }
-
-    __hostdev__ CurvatureStencil(const GridType& grid, double dx)
-        : BaseType(grid)
-        , mInv2Dx(ValueType(0.5 / dx))
-        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
-    {
-    }
-
-    /// @brief Return the mean curvature at the previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType meanCurvature() const
-    {
-        RealT alpha, normGrad;
-        return this->meanCurvature(alpha, normGrad) ?
-               ValueType(alpha*mInv2Dx/Pow3(normGrad)) : 0;
-    }
-
-    /// @brief Return the Gaussian curvature at the previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType gaussianCurvature() const
-    {
-        RealT alpha, normGrad;
-        return this->gaussianCurvature(alpha, normGrad) ?
-               ValueType(alpha*mInvDx2/Pow4(normGrad)) : 0;
-    }
-
-    /// @brief Return both the mean and the Gaussian curvature at the
-    ///        previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline void curvatures(ValueType &mean, ValueType& gauss) const
-    {
-        RealT alphaM, alphaG, normGrad;
-        if (this->curvatures(alphaM, alphaG, normGrad)) {
-            mean  = ValueType(alphaM*mInv2Dx/Pow3(normGrad));
-            gauss = ValueType(alphaG*mInvDx2/Pow4(normGrad));
-        } else {
-            mean = gauss = 0;
-        }
-    }
-
-    /// Return the mean curvature multiplied by the norm of the
-    /// central-difference gradient. This method is very useful for
-    /// mean-curvature flow of level sets!
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType meanCurvatureNormGrad() const
-    {
-        RealT alpha, normGrad;
-        return this->meanCurvature(alpha, normGrad) ?
-               ValueType(alpha*mInvDx2/(2*Pow2(normGrad))) : 0;
-    }
-
-    /// Return the Gaussian curvature multiplied by the norm of the
-    /// central-difference gradient.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType gaussianCurvatureNormGrad() const
-    {
-        RealT alpha, normGrad;
-        return this->gaussianCurvature(alpha, normGrad) ?
-               ValueType(2*alpha*mInv2Dx*mInvDx2/Pow3(normGrad)) : 0;
-    }
-
-    /// @brief Return both the mean and the Gaussian curvature at the
-    ///        previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline void curvaturesNormGrad(ValueType &mean, ValueType& gauss) const
-    {
-        RealT alphaM, alphaG, normGrad;
-        if (this->curvatures(alphaM, alphaG, normGrad)) {
-            mean  = ValueType(alphaM*mInvDx2/(2*Pow2(normGrad)));
-            gauss = ValueType(2*alphaG*mInv2Dx*mInvDx2/Pow3(normGrad));
-        } else {
-            mean = gauss = 0;
-        }
-    }
-
-    /// @brief Computes the minimum and maximum principal curvature at the
-    ///        previously buffered location.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline void principalCurvatures(ValueType &min, ValueType &max) const
-    {
-        min = max = 0;
-        RealT alphaM, alphaG, normGrad;
-        if (this->curvatures(alphaM, alphaG, normGrad)) {
-            const RealT mean = alphaM*mInv2Dx/Pow3(normGrad);
-            const RealT tmp = Sqrt(mean*mean - alphaG*mInvDx2/Pow4(normGrad));
-            min = ValueType(mean - tmp);
-            max = ValueType(mean + tmp);
-        }
-    }
-
-    /// Return the Laplacian computed at the previously buffered
-    /// location by second-order central differencing.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline ValueType laplacian() const
-    {
-        return mInvDx2 * (
-            mValues[1] + mValues[2] +
-            mValues[3] + mValues[4] +
-            mValues[5] + mValues[6] - 6*mValues[0]);
-    }
-
-    /// Return the gradient computed at the previously buffered
-    /// location by second-order central differencing.
-    ///
-    /// @note This method should not be called until the stencil
-    ///       buffer has been populated via a call to moveTo(ijk).
-    __hostdev__ inline Vec3<ValueType> gradient() const
-    {
-        return Vec3<ValueType>(
-            mValues[2] - mValues[1],
-            mValues[4] - mValues[3],
-            mValues[6] - mValues[5])*mInv2Dx;
-    }
-
-    /// Return linear offset for the specified stencil point relative to its center
-    template<int i, int j, int k>
-    __hostdev__ unsigned int pos() const { return CurvPt<i,j,k>::idx; }
-
-private:
-    __hostdev__ inline void init(const Coord &ijk)
-    {
-        mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0));
-        mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0));
-
-        mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, -1, 0));
-        mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0));
-
-        mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0, -1));
-        mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1));
-
-        mValues[ 7] = mAcc.getValue(ijk.offsetBy(-1, -1, 0));
-        mValues[ 8] = mAcc.getValue(ijk.offsetBy( 1, -1, 0));
-        mValues[ 9] = mAcc.getValue(ijk.offsetBy(-1, 1, 0));
-        mValues[10] = mAcc.getValue(ijk.offsetBy( 1, 1, 0));
-
-        mValues[11] = mAcc.getValue(ijk.offsetBy(-1, 0, -1));
-        mValues[12] = mAcc.getValue(ijk.offsetBy( 1, 0, -1));
-        mValues[13] = mAcc.getValue(ijk.offsetBy(-1, 0, 1));
-        mValues[14] = mAcc.getValue(ijk.offsetBy( 1, 0, 1));
-
-        mValues[15] = mAcc.getValue(ijk.offsetBy( 0, -1, -1));
-        mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 1, -1));
-        mValues[17] = mAcc.getValue(ijk.offsetBy( 0, -1, 1));
-        mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 1, 1));
-    }
-
-    __hostdev__ inline RealT Dx()  const { return 0.5*(mValues[2] - mValues[1]); }// * 1/dx
-    __hostdev__ inline RealT Dy()  const { return 0.5*(mValues[4] - mValues[3]); }// * 1/dx
-    __hostdev__ inline RealT Dz()  const { return 0.5*(mValues[6] - mValues[5]); }// * 1/dx
-    __hostdev__ inline RealT Dxx() const { return mValues[2] - 2 * mValues[0] + mValues[1]; }// * 1/dx2
-    __hostdev__ inline RealT Dyy() const { return mValues[4] - 2 * mValues[0] + mValues[3]; }// * 1/dx2
-    __hostdev__ inline RealT Dzz() const { return mValues[6] - 2 * mValues[0] + mValues[5]; }// * 1/dx2
-    __hostdev__ inline RealT Dxy() const { return 0.25 * (mValues[10] - mValues[ 8] + mValues[ 7] - mValues[ 9]); }// * 1/dx2
-    __hostdev__ inline RealT Dxz() const { return 0.25 * (mValues[14] - mValues[12] + mValues[11] - mValues[13]); }// * 1/dx2
-    __hostdev__ inline RealT Dyz() const { return 0.25 * (mValues[18] - mValues[16] + mValues[15] - mValues[17]); }// * 1/dx2
-
-    __hostdev__ inline bool meanCurvature(RealT& alpha, RealT& normGrad) const
-    {
-        // For performance all finite differences are unscaled wrt dx
-        const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
-                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
-        if (normGrad2 <= Tolerance<RealT>::value()) {
-            alpha = normGrad = 0;
-            return false;
-        }
-        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz();
-        alpha = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) -
-                2*(Dx*(Dy*this->Dxy() + Dz*this->Dxz()) + Dy*Dz*this->Dyz());// * 1/dx^4
-        normGrad = Sqrt(normGrad2); // * 1/dx
-        return true;
-    }
-
-    __hostdev__ inline bool gaussianCurvature(RealT& alpha, RealT& normGrad) const
-    {
-        // For performance all finite differences are unscaled wrt dx
-        const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
-                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
-        if (normGrad2 <= Tolerance<RealT>::value()) {
-            alpha = normGrad = 0;
-            return false;
-        }
-        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(),
-                    Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz();
-        alpha = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) +
-                2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// * 1/dx^6
-        normGrad = Sqrt(normGrad2); // * 1/dx
-        return true;
-    }
-
-    __hostdev__ inline bool curvatures(RealT& alphaM, RealT& alphaG, RealT& normGrad) const
-    {
-        // For performance all finite differences are unscaled wrt dx
-        const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(),
-                    Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2;
-        if (normGrad2 <= Tolerance<RealT>::value()) {
-            alphaM = alphaG = normGrad = 0;
-            return false;
-        }
-        const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(),
-                    Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz();
-        alphaM = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) -
-                 2*(Dx*(Dy*Dxy + Dz*Dxz) + Dy*Dz*Dyz);// *1/dx^4
-        alphaG = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) +
-                 2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// *1/dx^6
-        normGrad = Sqrt(normGrad2); // * 1/dx
-        return true;
-    }
-
-    template<typename, int, typename> friend class BaseStencil; // allow base class to call init()
-    using BaseType::mAcc;
-    using BaseType::mValues;
-    const ValueType mInv2Dx, mInvDx2;
-}; // CurvatureStencil class
-
-} // end nanovdb namespace
-
-#endif // NANOVDB_STENCILS_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/math/Stencils.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/Stencils.h instead.")
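For reviewers (not part of the patch): the curvature queries above share one stencil buffer, so a typical loop moves the stencil once and samples several quantities. A sketch, assuming a valid level-set FloatGrid* named grid:

    // Sketch only: grid and coordinate are placeholders.
    nanovdb::CurvatureStencil<nanovdb::FloatGrid> stencil(*grid);
    stencil.moveTo(nanovdb::Coord(10, 20, 30));
    float mean  = stencil.meanCurvature();
    float gauss = stencil.gaussianCurvature();
    float kMin, kMax;
    stencil.principalCurvatures(kMin, kMax); // derived from the mean/Gaussian pair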
diff --git a/nanovdb/nanovdb/util/Timer.h b/nanovdb/nanovdb/util/Timer.h
new file mode 100644
index 0000000000..992b055b4c
--- /dev/null
+++ b/nanovdb/nanovdb/util/Timer.h
@@ -0,0 +1,87 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/// @file nanovdb/util/Timer.h
+///
+/// @author Ken Museth
+///
+/// @brief A simple timing class (in case openvdb::util::CpuTimer is unavailable)
+
+#ifndef NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <chrono>
+
+namespace nanovdb {
+
+namespace util {
+
+class Timer
+{
+    std::chrono::high_resolution_clock::time_point mStart;
+public:
+    /// @brief Default constructor
+    Timer() {}
+
+    /// @brief Constructor that starts the timer
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    Timer(const std::string &msg, std::ostream& os = std::cerr) {this->start(msg, os);}
+
+    /// @brief Start the timer
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    void start(const std::string &msg, std::ostream& os = std::cerr)
+    {
+        os << msg << " ... " << std::flush;
+        mStart = std::chrono::high_resolution_clock::now();
+    }
+
+    /// @brief elapsed time (since start) in milliseconds
+    template <typename AccuracyT = std::chrono::milliseconds>
+    auto elapsed()
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        return std::chrono::duration_cast<AccuracyT>(end - mStart).count();
+    }
+
+    /// @brief stop the timer
+    /// @tparam AccuracyT Template parameter defining the accuracy of the reported times
+    /// @param os output stream for the message above
+    template <typename AccuracyT = std::chrono::milliseconds>
+    void stop(std::ostream& os = std::cerr)
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        auto diff = std::chrono::duration_cast<AccuracyT>(end - mStart).count();
+        os << "completed in " << diff;
+        if (std::is_same<AccuracyT, std::chrono::microseconds>::value) {// resolved at compile-time
+            os << " microseconds" << std::endl;
+        } else if (std::is_same<AccuracyT, std::chrono::milliseconds>::value) {
+            os << " milliseconds" << std::endl;
+        } else if (std::is_same<AccuracyT, std::chrono::seconds>::value) {
+            os << " seconds" << std::endl;
+        } else {
+            os << " unknown time unit" << std::endl;
+        }
+    }
+
+    /// @brief stop and start the timer
+    /// @tparam AccuracyT Template parameter defining the accuracy of the reported times
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    template <typename AccuracyT = std::chrono::milliseconds>
+    void restart(const std::string &msg, std::ostream& os = std::cerr)
+    {
+        this->stop<AccuracyT>();
+        this->start(msg, os);
+    }
+};// Timer
+
+}// namespace util
+
+using CpuTimer [[deprecated("Use nanovdb::util::Timer instead")]] = util::Timer;
+
+} // namespace nanovdb
+
+#endif // NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED
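A sketch of the intended use of the new util::Timer (names match the header above; the timed work is a placeholder):

    nanovdb::util::Timer timer("building grid");       // prints "building grid ... "
    // ... expensive work ...
    timer.stop();                                       // prints "completed in N milliseconds"
    timer.start("uploading");
    auto us = timer.elapsed<std::chrono::microseconds>();
    timer.restart<std::chrono::seconds>("next phase");  // stop (reported in seconds), then start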
diff --git a/nanovdb/nanovdb/util/Util.h b/nanovdb/nanovdb/util/Util.h
new file mode 100644
index 0000000000..e8ebfc1c63
--- /dev/null
+++ b/nanovdb/nanovdb/util/Util.h
@@ -0,0 +1,657 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file nanovdb/util/Util.h
+
+    \author Ken Museth
+
+    \date January 8, 2020
+
+    \brief Utility functions
+*/
+
+#ifndef NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED
+
+#ifdef __CUDACC_RTC__
+
+typedef signed char        int8_t;
+typedef short              int16_t;
+typedef int                int32_t;
+typedef long long          int64_t;
+typedef unsigned char      uint8_t;
+typedef unsigned int       uint32_t;
+typedef unsigned short     uint16_t;
+typedef unsigned long long uint64_t;
+
+#define NANOVDB_ASSERT(x)
+
+#ifndef UINT64_C
+#define UINT64_C(x) (x ## ULL)
+#endif
+
+#else // !__CUDACC_RTC__
+
+#include <cstdlib> // for abs in clang7
+#include <cstdint> // for types like int32_t etc
+#include <cstddef> // for size_t type
+#include <cassert> // for assert
+#include <cstdio> // for stderr and snprintf
+#include <cmath> // for sqrt and fma
+#include <limits> // for numeric_limits
+#include <utility> // for std::move
+#ifdef NANOVDB_USE_IOSTREAMS
+#include <fstream> // for read/writeUncompressedGrids
+#endif// ifdef NANOVDB_USE_IOSTREAMS
+
+// All asserts can be disabled here, even for debug builds
+#if 1
+#define NANOVDB_ASSERT(x) assert(x)
+#else
+#define NANOVDB_ASSERT(x)
+#endif
+
+#if defined(NANOVDB_USE_INTRINSICS) && defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse64)
+#pragma intrinsic(_BitScanForward64)
+#endif
+
+#endif // __CUDACC_RTC__
+
+#if defined(__CUDACC__) || defined(__HIP__)
+// Only define __hostdev__ qualifier when using NVIDIA CUDA or HIP compilers
+#ifndef __hostdev__
+#define __hostdev__ __host__ __device__ // Runs on the CPU and GPU, called from the CPU or the GPU
+#endif
+#else
+// Dummy definitions of macros only defined by CUDA and HIP compilers
+#ifndef __hostdev__
+#define __hostdev__ // Runs on the CPU and GPU, called from the CPU or the GPU
+#endif
+#ifndef __global__
+#define __global__ // Runs on the GPU, called from the CPU or the GPU
+#endif
+#ifndef __device__
+#define __device__ // Runs on the GPU, called from the GPU
+#endif
+#ifndef __host__
+#define __host__ // Runs on the CPU, called from the CPU
+#endif
+
+#endif // if defined(__CUDACC__) || defined(__HIP__)
+
+// The following macro will suppress annoying warnings when nvcc
+// compiles functions that call (host) intrinsics (which is perfectly valid)
+#if defined(_MSC_VER) && defined(__CUDACC__)
+#define NANOVDB_HOSTDEV_DISABLE_WARNING __pragma("hd_warning_disable")
+#elif defined(__GNUC__) && defined(__CUDACC__)
+#define NANOVDB_HOSTDEV_DISABLE_WARNING _Pragma("hd_warning_disable")
+#else
+#define NANOVDB_HOSTDEV_DISABLE_WARNING
+#endif
+
+// Define compiler warnings that work with all compilers
+//#if defined(_MSC_VER)
+//#define NANO_WARNING(msg) _pragma("message" #msg)
+//#else
+//#define NANO_WARNING(msg) _Pragma("message" #msg)
+//#endif
+
+//==============================================
+/// @brief Defines macros that issue warnings for deprecated header files
+/// @details Example:
+/// @code
+/// #include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+/// #include <nanovdb/NewHeader.h>
+/// NANOVDB_DEPRECATED_HEADER("This header file is deprecated, please use <nanovdb/NewHeader.h> instead")
+/// @endcode
+#ifdef __GNUC__
+#define NANOVDB_PRAGMA(X) _Pragma(#X)
+#define NANOVDB_DEPRECATED_HEADER(MSG) NANOVDB_PRAGMA(GCC warning MSG)
+#elif defined(_MSC_VER)
+#define NANOVDB_STRINGIZE_(MSG) #MSG
+#define NANOVDB_STRINGIZE(MSG) NANOVDB_STRINGIZE_(MSG)
+#define NANOVDB_DEPRECATED_HEADER(MSG) \
+    __pragma(message(__FILE__ "(" NANOVDB_STRINGIZE(__LINE__) ") : Warning: " MSG))
+#endif
+
+// A portable implementation of offsetof - unfortunately it doesn't work with static_assert
+#define NANOVDB_OFFSETOF(CLASS, MEMBER) ((int)(size_t)((char*)&((CLASS*)0)->MEMBER - (char*)0))
+
+namespace nanovdb {// =================================================================
+
+namespace util {// ====================================================================
+
+/// @brief Minimal implementation of std::declval, which converts any type @c T to
+///        a reference type, making it possible to use member functions in the operand
+///        of the decltype specifier without the need to go through constructors.
+/// @tparam T Template type to be converted to T&&
+/// @return T&&
+/// @warning Unlike std::declval, this version does not work when T = void! However,
+///          NVRTC does not like std::declval, so we provide our own implementation.
+template <typename T>
+T&& declval() noexcept;
+
+// --------------------------> string utility functions <------------------------------------
+
+/// @brief tests if a c-string @c str is empty, that is its first value is '\0'
+/// @param str c-string to be tested for null termination
+/// @return true if str[0] = '\0'
+__hostdev__ inline bool empty(const char* str)
+{
+    NANOVDB_ASSERT(str != nullptr);
+    return *str == '\0';
+}// util::empty
+
+/// @brief length of a c-string, excluding '\0'.
+/// @param str c-string
+/// @return the number of characters that precede the terminating null character.
+__hostdev__ inline size_t strlen(const char *str)
+{
+    NANOVDB_ASSERT(str != nullptr);
+    const char *s = str;
+    while(*s) ++s;
+    return (s - str);
+}// util::strlen
+
+/// @brief Copy characters from @c src to @c dst.
+/// @param dst pointer to the destination string.
+/// @param src pointer to the null-terminated source string.
+/// @return destination string @c dst.
+/// @note Emulates the behaviour of std::strcpy, except this version also runs on the GPU.
+__hostdev__ inline char* strcpy(char *dst, const char *src)
+{
+    NANOVDB_ASSERT(dst != nullptr && src != nullptr);
+    for (char *p = dst; (*p++ = *src) != '\0'; ++src);
+    return dst;
+}// util::strcpy(char*, const char*)
+
+/// @brief Copies the first num characters of @c src to @c dst.
+///        If the end of the source C string (which is signaled by a
+///        null-character) is found before @c max characters have been
+///        copied, @c dst is padded with zeros until a total of @c max
+///        characters have been written to it.
+/// @param dst destination string
+/// @param src source string
+/// @param max maximum number of character in destination string
+/// @return destination string @c dst
+/// @warning if strncpy(dst, src, max)[max-1]!='\0' then @c src has more
+///          characters than @c max and the return string needs to be
+///          manually null-terminated, i.e. strncpy(dst, src, max)[max-1]='\0'
+__hostdev__ inline char* strncpy(char *dst, const char *src, size_t max)
+{
+    NANOVDB_ASSERT(dst != nullptr && src != nullptr);
+    size_t i = 0;
+    for (; i < max && src[i] != '\0'; ++i) dst[i] = src[i];
+    for (; i < max; ++i) dst[i] = '\0';
+    return dst;
+}// util::strncpy(char *dst, const char *src, size_t max)
+
+/// @brief converts a number to a string using a specific base
+/// @param dst destination string
+/// @param num signed number to be concatenated after @c dst
+/// @param bas base used when converting @c num to a string
+/// @return destination string @c dst
+/// @note Emulates the behaviour of itoa, except this version also works on the GPU.
+__hostdev__ inline char* strcpy(char* dst, int num, int bas = 10)
+{
+    NANOVDB_ASSERT(dst != nullptr && bas > 0);
+    int len = 0;// length of number once converted to a string
+    if (num == 0) dst[len++] = '0';
+    for (int abs = num < 0 && bas == 10 ? -num : num; abs; abs /= bas) {
+        const int rem = abs % bas;
+        dst[len++] = rem > 9 ? rem - 10 + 'a' : rem + '0';
+    }
+    if (num < 0) dst[len++] = '-';// append '-' if negative
+    for (char *a = dst, *b = a + len - 1; a < b; ++a, --b) {// reverse dst
+        dst[len] = *a;// use end of string as temp
+        *a = *b;
+        *b = dst[len];
+    }
+    dst[len] = '\0';// explicitly terminate end of string
+    return dst;
+}// util::strcpy(char*, int, int)
+
+/// @brief Appends a copy of the character string pointed to by @c src to
+///        the end of the character string pointed to by @c dst on the device.
+/// @param dst pointer to the null-terminated byte string to append to.
+/// @param src pointer to the null-terminated byte string to copy from.
+/// @return pointer to the character array being appended to.
+/// @note Emulates the behaviour of std::strcat, except this version also runs on the GPU.
+__hostdev__ inline char* strcat(char *dst, const char *src)
+{
+    NANOVDB_ASSERT(dst != nullptr && src != nullptr);
+    char *p = dst;
+    while (*p != '\0') ++p;// advance till end of dst
+    strcpy(p, src);// append src
+    return dst;
+}// util::strcat(char*, const char*)
+
+/// @brief concatenates a number after a string using a specific base
+/// @param dst null terminated destination string
+/// @param num signed number to be concatenated after @c dst
+/// @param bas base used when converting @c num to a string
+/// @return destination string @c dst
+__hostdev__ inline char* strcat(char* dst, int num, int bas = 10)
+{
+    NANOVDB_ASSERT(dst != nullptr);
+    char *p = dst;
+    while (*p != '\0') ++p;
+    strcpy(p, num, bas);
+    return dst;
+}// util::strcat(char*, int, int)
+
+/// @brief Compares two null-terminated byte strings lexicographically.
+/// @param lhs pointer to the null-terminated byte strings to compare
+/// @param rhs pointer to the null-terminated byte strings to compare
+/// @return Negative value if @c lhs appears before @c rhs in lexicographical order.
+///         Zero if @c lhs and @c rhs compare equal. Positive value if @c lhs appears
+///         after @c rhs in lexicographical order.
+/// @note Emulates the behaviour of std::strcmp, except this version also runs on the GPU.
+__hostdev__ inline int strcmp(const char *lhs, const char *rhs)
+{
+    while(*lhs != '\0' && (*lhs == *rhs)){
+        lhs++;
+        rhs++;
+    }
+    return *(const unsigned char*)lhs - *(const unsigned char*)rhs;// zero if lhs == rhs
+}// util::strcmp(const char*, const char*)
+
+/// @brief Test if two null-terminated byte strings are the same
+/// @param lhs pointer to the null-terminated byte strings to compare
+/// @param rhs pointer to the null-terminated byte strings to compare
+/// @return true if the two c-strings are identical
+__hostdev__ inline bool streq(const char *lhs, const char *rhs)
+{
+    return strcmp(lhs, rhs) == 0;
+}// util::streq
+
+namespace impl {// =======================================================
+// Base-case implementation of Variadic Template function impl::sprint
+__hostdev__ inline char* sprint(char *dst){return dst;}
+// Variadic Template function impl::sprint
+template <typename T, typename... Types>
+__hostdev__ inline char* sprint(char *dst, T var1, Types... var2)
+{
+    return impl::sprint(strcat(dst, var1), var2...);
+}
+}// namespace impl =========================================================
+
+/// @brief prints a variable number of string and/or numbers to a destination string
+template <typename T, typename... Types>
+__hostdev__ inline char* sprint(char *dst, T var1, Types... var2)
+{
+    return impl::sprint(strcpy(dst, var1), var2...);
+}// util::sprint
+
+// --------------------------> memzero <------------------------------------
+
+/// @brief Zero initialization of memory
+/// @param dst pointer to destination
+/// @param byteCount number of bytes to be initialized to zero
+/// @return destination pointer @c dst
+__hostdev__ inline static void* memzero(void *dst, size_t byteCount)
+{
+    NANOVDB_ASSERT(dst);
+    const size_t wordCount = byteCount >> 3;
+    if (wordCount << 3 == byteCount) {
+        for (auto *d = (uint64_t*)dst, *e = d + wordCount; d != e; ++d) *d = 0ULL;
+    } else {
+        for (auto *d = (char*)dst, *e = d + byteCount; d != e; ++d) *d = '\0';
+    }
+    return dst;
+}// util::memzero
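A sketch of the string helpers in use; they exist so device code can format simple messages without the C runtime (buffer contents shown in comments):

    char msg[64];
    nanovdb::util::sprint(msg, "grid #", 7, " of ", 42);  // "grid #7 of 42"
    nanovdb::util::strcat(msg, ", flags=0x");
    nanovdb::util::strcat(msg, 255, 16);                  // appends "ff"
    nanovdb::util::memzero(msg, sizeof(msg));             // word-wise zeroing when the size is 8-byte aligned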
+
+// --------------------------> util::is_same <------------------------------------
+
+/// @brief C++11 implementation of std::is_same
+/// @note When more than two arguments are provided value = T0==T1 || T0==T2 || ...
+template <typename T0, typename T1, typename ...T>
+struct is_same
+{
+    static constexpr bool value = is_same<T0, T1>::value || is_same<T0, T...>::value;
+};
+
+template <typename T0, typename T1>
+struct is_same<T0, T1> {static constexpr bool value = false;};
+
+template <typename T>
+struct is_same<T, T> {static constexpr bool value = true;};
+
+// --------------------------> util::is_floating_point <------------------------------------
+
+/// @brief C++11 implementation of std::is_floating_point
+template <typename T>
+struct is_floating_point {static constexpr bool value = is_same<T, float, double>::value;};
+
+// --------------------------> util::enable_if <------------------------------------
+
+/// @brief C++11 implementation of std::enable_if
+template <bool, typename T = void>
+struct enable_if {};
+
+template <typename T>
+struct enable_if<true, T> {using type = T;};
+
+// --------------------------> util::disable_if <------------------------------------
+
+template <bool, typename T = void>
+struct disable_if {using type = T;};
+
+template <typename T>
+struct disable_if<true, T> {};
+
+// --------------------------> util::is_const <------------------------------------
+
+template <typename T>
+struct is_const {static constexpr bool value = false;};
+
+template <typename T>
+struct is_const<const T> {static constexpr bool value = true;};
+
+// --------------------------> util::is_pointer <------------------------------------
+
+/// @brief Trait used to identify template parameter that are pointers
+/// @tparam T Template parameter to be tested
+template <typename T>
+struct is_pointer {static constexpr bool value = false;};
+
+/// @brief Template specialization of pointers
+/// @tparam T Template parameter to be tested
+/// @note T can be both a non-const and const type
+template <typename T>
+struct is_pointer<T*> {static constexpr bool value = true;};
+
+// --------------------------> util::conditional <------------------------------------
+
+/// @brief C++11 implementation of std::conditional
+template <bool, typename TrueT, typename FalseT>
+struct conditional { using type = TrueT; };
+
+/// @brief Template specialization of conditional
+/// @tparam FalseT Type used when boolean is false
+/// @tparam TrueT Type used when boolean is true
+template <typename TrueT, typename FalseT>
+struct conditional<false, TrueT, FalseT> { using type = FalseT; };
+
+// --------------------------> util::remove_const <------------------------------------
+
+/// @brief Trait used to remove the const qualifier from a type. Default implementation is just a pass-through
+/// @tparam T Type
+/// @details remove_const<float>::type = float
+template <typename T>
+struct remove_const {using type = T;};
+
+/// @brief Template specialization of trait class used to remove the const qualifier from a type
+/// @tparam T Type of the const type
+/// @details remove_const<const float>::type = float
+template <typename T>
+struct remove_const<const T> {using type = T;};
+
+// --------------------------> util::remove_reference <------------------------------------
+
+/// @brief Trait used to remove the reference, i.e. "&", qualifier from a type. Default implementation is just a pass-through
+/// @tparam T Type
+/// @details remove_reference<float>::type = float
+template <typename T>
+struct remove_reference {using type = T;};
+
+/// @brief Template specialization of trait class used to remove the reference, i.e. "&", qualifier from a type
+/// @tparam T Type of the reference
+/// @details remove_reference<float&>::type = float
+template <typename T>
+struct remove_reference<T&> {using type = T;};
+
+// --------------------------> util::remove_pointer <------------------------------------
+
+/// @brief Trait used to remove the pointer, i.e. "*", qualifier from a type. Default implementation is just a pass-through
+/// @tparam T Type
+/// @details remove_pointer<float>::type = float
+template <typename T>
+struct remove_pointer {using type = T;};
+
+/// @brief Template specialization of trait class used to remove the pointer, i.e. "*", qualifier from a type
+/// @tparam T Type of the pointer
+/// @details remove_pointer<float*>::type = float
+template <typename T>
+struct remove_pointer<T*> {using type = T;};
+
+// --------------------------> util::match_const <------------------------------------
+
+/// @brief Trait used to transfer the const-ness of a reference type to another type
+/// @tparam T Type whose const-ness needs to match the reference type
+/// @tparam ReferenceT Reference type that is not const
+/// @details match_const<int, float>::type = int
+///          match_const<const int, float>::type = int
+template <typename T, typename ReferenceT>
+struct match_const {using type = typename remove_const<T>::type;};
+
+/// @brief Template specialization used to transfer the const-ness of a reference type to another type
+/// @tparam T Type that will adopt the const-ness of the reference type
+/// @tparam ReferenceT Reference type that is const
+/// @details match_const<int, const float>::type = const int
+///          match_const<const int, const float>::type = const int
+template <typename T, typename ReferenceT>
+struct match_const<T, const ReferenceT> {using type = const typename remove_const<T>::type;};
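Compile-time checks illustrating the traits above; each static_assert should hold under the definitions as reconstructed:

    using namespace nanovdb::util;
    static_assert(is_same<float, double, float>::value, "variadic is_same matches any tail type");
    static_assert(is_floating_point<double>::value, "float or double");
    static_assert(is_pointer<const int*>::value, "pointer detection");
    static_assert(is_same<remove_pointer<float*>::type, float>::value, "strip one level of *");
    static_assert(is_same<match_const<int, const float>::type, const int>::value, "const-ness transferred");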
+
+// --------------------------> util::is_specialization <------------------------------------
+
+/// @brief Metafunction used to determine if the first template
+///        parameter is a specialization of the class template
+///        given in the second template parameter.
+///
+/// @details is_specialization<Vec3<float>, Vec3>::value == true;
+///          is_specialization<Vec3f, Vec3>::value == true;
+///          is_specialization<std::vector<float>, std::vector>::value == true;
+template <typename AnyType, template <typename...> class TemplateType>
+struct is_specialization {static const bool value = false;};
+template <typename... Args, template <typename...> class TemplateType>
+struct is_specialization<TemplateType<Args...>, TemplateType>
+{
+    static const bool value = true;
+};// util::is_specialization
+
+// --------------------------> util::PtrDiff <------------------------------------
+
+/// @brief Compute the distance, in bytes, between two pointers, dist = p - q
+/// @param p first pointer, assumed to NOT be NULL
+/// @param q second pointer, assumed to NOT be NULL
+/// @return signed distance between pointer, p - q, addresses in units of bytes
+__hostdev__ inline static int64_t PtrDiff(const void* p, const void* q)
+{
+    NANOVDB_ASSERT(p && q);
+    return reinterpret_cast<const char*>(p) - reinterpret_cast<const char*>(q);
+}// util::PtrDiff
+
+// --------------------------> util::PtrAdd <------------------------------------
+
+/// @brief Adds a byte offset to a non-const pointer to produce another non-const pointer
+/// @tparam DstT Type of the return pointer (defaults to void)
+/// @param p non-const input pointer, assumed to NOT be NULL
+/// @param offset signed byte offset
+/// @return a non-const pointer defined as the offset of an input pointer
+template <typename DstT = void>
+__hostdev__ inline static DstT* PtrAdd(void* p, int64_t offset)
+{
+    NANOVDB_ASSERT(p);
+    return reinterpret_cast<DstT*>(reinterpret_cast<char*>(p) + offset);
+}// util::PtrAdd
+
+/// @brief Adds a byte offset to a const pointer to produce another const pointer
+/// @tparam DstT Type of the return pointer (defaults to void)
+/// @param p const input pointer, assumed to NOT be NULL
+/// @param offset signed byte offset
+/// @return a const pointer defined as the offset of a const input pointer
+template <typename DstT = void>
+__hostdev__ inline static const DstT* PtrAdd(const void* p, int64_t offset)
+{
+    NANOVDB_ASSERT(p);
+    return reinterpret_cast<const DstT*>(reinterpret_cast<const char*>(p) + offset);
+}// util::PtrAdd
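A small sketch of the byte-offset helpers; the Header struct and its fields are illustrative only:

    struct Header { uint64_t magic, size; };
    Header h{0, 0};
    uint64_t *size = nanovdb::util::PtrAdd<uint64_t>(&h, sizeof(uint64_t)); // -> &h.size
    int64_t   diff = nanovdb::util::PtrDiff(size, &h);                      // == 8 bytes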
+
+// -------------------> findLowestOn <----------------------------
+
+/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 32 bit word
+///
+/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)!
+NANOVDB_HOSTDEV_DISABLE_WARNING
+__hostdev__ inline uint32_t findLowestOn(uint32_t v)
+{
+    NANOVDB_ASSERT(v);
+#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS)
+    return __ffs(v) - 1; // one based indexing
+#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS)
+    unsigned long index;
+    _BitScanForward(&index, v);
+    return static_cast<uint32_t>(index);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS)
+    return static_cast<uint32_t>(__builtin_ctzl(v));
+#else
+    //NANO_WARNING("Using software implementation for findLowestOn(uint32_t v)")
+    static const unsigned char DeBruijn[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
+// disable unary minus on unsigned warning
+#if defined(_MSC_VER) && !defined(__NVCC__)
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27];
+#if defined(_MSC_VER) && !defined(__NVCC__)
+#pragma warning(pop)
+#endif
+
+#endif
+}// util::findLowestOn(uint32_t)
+
+/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 64 bit word
+///
+/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint64_t(0)!
+NANOVDB_HOSTDEV_DISABLE_WARNING
+__hostdev__ inline uint32_t findLowestOn(uint64_t v)
+{
+    NANOVDB_ASSERT(v);
+#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS)
+    return __ffsll(static_cast<unsigned long long int>(v)) - 1; // one based indexing
+#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS)
+    unsigned long index;
+    _BitScanForward64(&index, v);
+    return static_cast<uint32_t>(index);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS)
+    return static_cast<uint32_t>(__builtin_ctzll(v));
+#else
+    //NANO_WARNING("Using software implementation for util::findLowestOn(uint64_t)")
+    static const unsigned char DeBruijn[64] = {
+        0,   1,  2, 53,  3,  7, 54, 27,  4, 38, 41,  8, 34, 55, 48, 28,
+        62,  5, 39, 46, 44, 42, 22,  9, 24, 35, 59, 56, 49, 18, 29, 11,
+        63, 52,  6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
+        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12,
+    };
+// disable unary minus on unsigned warning
+#if defined(_MSC_VER) && !defined(__NVCC__)
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    return DeBruijn[uint64_t((v & -v) * UINT64_C(0x022FDD63CC95386D)) >> 58];
+#if defined(_MSC_VER) && !defined(__NVCC__)
+#pragma warning(pop)
+#endif
+
+#endif
+}// util::findLowestOn(uint64_t)
+
+// -------------------> findHighestOn <----------------------------
+
+/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word
+///
+/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)!
+NANOVDB_HOSTDEV_DISABLE_WARNING
+__hostdev__ inline uint32_t findHighestOn(uint32_t v)
+{
+    NANOVDB_ASSERT(v);
+#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS)
+    return sizeof(uint32_t) * 8 - 1 - __clz(v); // Return the number of consecutive high-order zero bits in a 32-bit integer.
+#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS)
+    unsigned long index;
+    _BitScanReverse(&index, v);
+    return static_cast<uint32_t>(index);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS)
+    return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v);
+#else
+    //NANO_WARNING("Using software implementation for util::findHighestOn(uint32_t)")
+    static const unsigned char DeBruijn[32] = {
+        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+    v |= v >> 1; // first round down to one less than a power of 2
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return DeBruijn[uint32_t(v * 0x07C4ACDDU) >> 27];
+#endif
+}// util::findHighestOn
+
+/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 64 bit word
+///
+/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint64_t(0)!
+NANOVDB_HOSTDEV_DISABLE_WARNING
+__hostdev__ inline uint32_t findHighestOn(uint64_t v)
+{
+    NANOVDB_ASSERT(v);
+#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS)
+    return sizeof(unsigned long) * 8 - 1 - __clzll(static_cast<long long int>(v));
+#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS)
+    unsigned long index;
+    _BitScanReverse64(&index, v);
+    return static_cast<uint32_t>(index);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS)
+    return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v);
+#else
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(&v);
+    return p[1] ? 32u + findHighestOn(p[1]) : findHighestOn(p[0]);
+#endif
+}// util::findHighestOn
+
+// ----------------------------> util::countOn <--------------------------------------
+
+/// @return Number of bits that are on in the specified 64-bit word
+NANOVDB_HOSTDEV_DISABLE_WARNING
+__hostdev__ inline uint32_t countOn(uint64_t v)
+{
+#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS)
+    //#warning Using popcll for util::countOn
+    return __popcll(v);
+// __popcnt64 intrinsic support was added in VS 2019 16.8
+#elif defined(_MSC_VER) && defined(_M_X64) && (_MSC_VER >= 1928) && defined(NANOVDB_USE_INTRINSICS)
+    //#warning Using popcnt64 for util::countOn
+    return uint32_t(__popcnt64(v));
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS)
+    //#warning Using builtin_popcountll for util::countOn
+    return __builtin_popcountll(v);
+#else // use software implementation
+    //NANO_WARNING("Using software implementation for util::countOn")
+    v = v - ((v >> 1) & uint64_t(0x5555555555555555));
+    v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333));
+    return (((v + (v >> 4)) & uint64_t(0xF0F0F0F0F0F0F0F)) * uint64_t(0x101010101010101)) >> 56;
+#endif
+}// util::countOn(uint64_t)
+
+}// namespace util ==================================================================
+
+[[deprecated("Use nanovdb::util::findLowestOn instead")]]
+__hostdev__ inline uint32_t FindLowestOn(uint32_t v){return util::findLowestOn(v);}
+[[deprecated("Use nanovdb::util::findLowestOn instead")]]
+__hostdev__ inline uint32_t FindLowestOn(uint64_t v){return util::findLowestOn(v);}
+[[deprecated("Use nanovdb::util::findHighestOn instead")]]
+__hostdev__ inline uint32_t FindHighestOn(uint32_t v){return util::findHighestOn(v);}
+[[deprecated("Use nanovdb::util::findHighestOn instead")]]
+__hostdev__ inline uint32_t FindHighestOn(uint64_t v){return util::findHighestOn(v);}
+[[deprecated("Use nanovdb::util::countOn instead")]]
+__hostdev__ inline uint32_t CountOn(uint64_t v){return util::countOn(v);}
+
+} // namespace nanovdb ===================================================================
+
+#endif // end of NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED
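The bit helpers in action on a small mask (expected values shown in the comments):

    uint64_t mask = 0x28;                              // bits 3 and 5 set
    uint32_t lo = nanovdb::util::findLowestOn(mask);   // 3
    uint32_t hi = nanovdb::util::findHighestOn(mask);  // 5
    uint32_t n  = nanovdb::util::countOn(mask);        // 2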
diff --git a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
index c750412458..d366bd9845 100644
--- a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
@@ -1,127 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file CudaAddBlindData.cuh
-
-    \author Ken Museth
-
-    \date August 3, 2023
-
-    \brief Defines function that appends blind device data to an existing device NanoGrid
-
-    \warning The header file contains cuda device code so be sure
-             to only include it in .cu files (or other .cuh files)
-*/
-
-#ifndef NVIDIA_CUDA_ADD_BLIND_DATA_CUH_HAS_BEEN_INCLUDED
-#define NVIDIA_CUDA_ADD_BLIND_DATA_CUH_HAS_BEEN_INCLUDED
-
-#include
-#include "CudaDeviceBuffer.h"
-#include
-#include
-#include
-#include
-
-#include <cstring> // for std::strcpy
-
-namespace nanovdb {
-
-/// @brief This function appends blind data to an existing NanoGrid
-/// @tparam BuildT Build type of the grid
-/// @tparam BlindDataT Type of the blind data
-/// @tparam BufferT Type of the buffer used for allocation
-/// @param d_grid Pointer to device grid
-/// @param d_blindData Pointer to device blind data
-/// @param valueCount number of values in the blind data
-/// @param blindClass class of the blind data
-/// @param semantics semantics of the blind data
-/// @param name optional name of the blind data
-/// @param pool optional pool used for allocation
-/// @param stream optional CUDA stream (defaults to CUDA stream 0)
-/// @return GridHandle with blind data appended
-template<typename BuildT, typename BlindDataT, typename BufferT = CudaDeviceBuffer>
-GridHandle<BufferT>
-cudaAddBlindData(const NanoGrid<BuildT> *d_grid,
-                 const BlindDataT *d_blindData,
-                 uint64_t valueCount,
-                 GridBlindDataClass blindClass = GridBlindDataClass::Unknown,
-                 GridBlindDataSemantic semantics = GridBlindDataSemantic::Unknown,
-                 const char *name = "",
-                 const BufferT &pool = BufferT(),
-                 cudaStream_t stream = 0)
-{
-    // In:  |-----------|----------|-----------|
-    //        old grid    old meta   old data
-    // Out: |-----------|----------|----------|-----------|------------|
-    //        old grid    old meta   new meta   old data     new data
-
-    static_assert(BufferTraits<BufferT>::hasDeviceDual, "Expected BufferT to support device allocation");
-
-    // extract byte sizes of the grid, blind meta data and blind data
-    enum {GRID=0, META=1, DATA=2, CHECKSUM=3};
-    uint64_t tmp[4], *d_tmp;
-    cudaCheck(cudaMallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream));
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
-        if (auto count = d_grid->blindDataCount()) {
-            d_tmp[GRID] = PtrDiff(&d_grid->blindMetaData(0), d_grid);
-            d_tmp[META] = count*sizeof(GridBlindMetaData);
-            d_tmp[DATA] = d_grid->gridSize() - d_tmp[GRID] - d_tmp[META];
-        } else {
-            d_tmp[GRID] = d_grid->gridSize();
-            d_tmp[META] = d_tmp[DATA] = 0u;
-        }
-        d_tmp[CHECKSUM] = d_grid->checksum();
-    }); cudaCheckError();
-    cudaCheck(cudaMemcpyAsync(&tmp, d_tmp, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-
-    GridBlindMetaData metaData{int64_t(sizeof(GridBlindMetaData) + tmp[DATA]), valueCount,
-                               sizeof(BlindDataT), semantics, blindClass, mapToGridType<BlindDataT>()};
-    if (!metaData.isValid()) throw std::runtime_error("cudaAddBlindData: invalid combination of blind meta data");
-    std::strcpy(metaData.mName, name);
-    auto buffer = BufferT::create(tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + metaData.blindDataSize(), &pool, false);
-    auto d_data = buffer.deviceData();
-
-    // 1:   |-----------|----------|
-    //        old grid    old meta
-    cudaCheck(cudaMemcpyAsync(d_data, d_grid, tmp[GRID] + tmp[META], cudaMemcpyDeviceToDevice, stream));
-
-    // 2:   |-----------|----------|----------|
-    //        old grid    old meta   new meta
-    cudaCheck(cudaMemcpyAsync(d_data + tmp[GRID] + tmp[META], &metaData, sizeof(GridBlindMetaData), cudaMemcpyHostToDevice, stream));
-
-    // 3:   |-----------|----------|----------|-----------|
-    //        old grid    old meta   new meta   old data
-    cudaCheck(cudaMemcpyAsync(d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData),
-                              (const char*)d_grid + tmp[GRID] + tmp[META], tmp[DATA], cudaMemcpyDeviceToDevice, stream));
-
-    // 4:   |-----------|----------|----------|-----------|------------|
-    //        old grid    old meta   new meta   old data     new data
-    const size_t dataSize = valueCount*sizeof(BlindDataT);// no padding
-    cudaCheck(cudaMemcpyAsync(d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA],
-                              d_blindData, dataSize, cudaMemcpyDeviceToDevice, stream));
-    if (auto padding = metaData.blindDataSize() - dataSize) {// zero out possible padding
-        cudaCheck(cudaMemsetAsync(d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + dataSize, 0, padding, stream));
-    }
-
-    // increment grid size and blind data counter in output grid
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
-        auto &grid = *reinterpret_cast<NanoGrid<BuildT>*>(d_data);
-        grid.mBlindMetadataCount += 1;
-        grid.mBlindMetadataOffset = d_tmp[GRID];
-        auto *meta = PtrAdd<GridBlindMetaData>(d_data, grid.mBlindMetadataOffset);// points to first blind meta data
-        for (uint32_t i=0, n=grid.mBlindMetadataCount-1; i<n; ++i, ++meta) meta->mDataOffset += sizeof(GridBlindMetaData);
-        grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment
-    }); cudaCheckError();
-    cudaCheck(cudaFreeAsync(d_tmp, stream));
-
-    GridChecksum cs(tmp[CHECKSUM]);
-    cudaGridChecksum(reinterpret_cast<GridData*>(d_data), cs.mode());
-
-    return GridHandle<BufferT>(std::move(buffer));
-}// cudaAddBlindData
-
-}// nanovdb namespace
-
-#endif // NVIDIA_CUDA_ADD_BLIND_DATA_CUH_HAS_BEEN_INCLUDED
\ No newline at end of file
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/AddBlindData.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/AddBlindData.cuh instead.")
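A hedged sketch of how this (now relocated) API is called; the device pointers, count, and name are placeholders:

    // d_grid: device NanoGrid<float>*; d_radii: device float*; valueCount: number of floats
    auto handle = nanovdb::cudaAddBlindData(d_grid, d_radii, valueCount,
                                            nanovdb::GridBlindDataClass::AttributeArray,
                                            nanovdb::GridBlindDataSemantic::Unknown,
                                            "radius"); // returns a GridHandle<CudaDeviceBuffer>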
diff --git a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
index 4b9820771d..b05fbac802 100644
--- a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
+++ b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
@@ -1,194 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file CudaDeviceBuffer.h
-
-    \author Ken Museth
-
-    \date January 8, 2020
-
-    \brief Implements a simple dual (host/device) CUDA buffer.
-
-    \note This file has no device-only (kernel) function calls,
-          which explains why it's a .h and not .cuh file.
-*/
-
-#ifndef NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED
-#define NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED
-
-#include "../HostBuffer.h" // for BufferTraits
-#include "CudaUtils.h"// for cudaMalloc/cudaMallocManaged/cudaFree
-
-namespace nanovdb {
-
-// ----------------------------> CudaDeviceBuffer <--------------------------------------
-
-/// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC.
-///        Obviously this class is making explicit use of CUDA so replace it with your own memory
-///        allocator if you are not using CUDA.
-/// @note  While CUDA's pinned host memory allows for asynchronous memory copy between host and device
-///        it is significantly slower than cached (un-pinned) memory on the host.
-class CudaDeviceBuffer
-{
-
-    uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device)
-    uint8_t *mCpuData, *mGpuData; // raw pointers to the host and device buffers
-
-public:
-    /// @brief Static factory method that returns an instance of this buffer
-    /// @param size byte size of buffer to be initialized
-    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
-    /// @param stream optional stream argument (defaults to stream NULL)
-    /// @return An instance of this class using move semantics
-    static CudaDeviceBuffer create(uint64_t size, const CudaDeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr);
-
-    /// @brief Constructor
-    /// @param size byte size of buffer to be initialized
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
-    /// @param stream optional stream argument (defaults to stream NULL)
-    CudaDeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr)
-        : mSize(0)
-        , mCpuData(nullptr)
-        , mGpuData(nullptr)
-    {
-        if (size > 0) this->init(size, host, stream);
-    }
-
-    /// @brief Disallow copy-construction
-    CudaDeviceBuffer(const CudaDeviceBuffer&) = delete;
-
-    /// @brief Move constructor
-    CudaDeviceBuffer(CudaDeviceBuffer&& other) noexcept
-        : mSize(other.mSize)
-        , mCpuData(other.mCpuData)
-        , mGpuData(other.mGpuData)
-    {
-        other.mSize = 0;
-        other.mCpuData = nullptr;
-        other.mGpuData = nullptr;
-    }
-
-    /// @brief Disallow copy assignment operation
-    CudaDeviceBuffer& operator=(const CudaDeviceBuffer&) = delete;
-
-    /// @brief Move assignment operation
-    CudaDeviceBuffer& operator=(CudaDeviceBuffer&& other) noexcept
-    {
-        this->clear();
-        mSize = other.mSize;
-        mCpuData = other.mCpuData;
-        mGpuData = other.mGpuData;
-        other.mSize = 0;
-        other.mCpuData = nullptr;
-        other.mGpuData = nullptr;
-        return *this;
-    }
-
-    /// @brief Destructor frees memory on both the host and device
-    ~CudaDeviceBuffer() { this->clear(); };
-
-    /// @brief Initialize buffer
-    /// @param size byte size of buffer to be initialized
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
-    /// @note All existing buffers are first cleared
-    /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
-    void init(uint64_t size, bool host = true, void* stream = nullptr);
-
-    /// @brief Returns a raw pointer to the host/CPU buffer managed by this allocator.
-    /// @warning Note that the pointer can be NULL!
-    uint8_t* data() const { return mCpuData; }
-
-    /// @brief Returns a raw pointer to the device/GPU buffer managed by this allocator.
-    /// @warning Note that the pointer can be NULL!
-    uint8_t* deviceData() const { return mGpuData; }
-
-    /// @brief Upload this buffer from the host to the device, i.e. CPU -> GPU.
-    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
-    /// @param sync if false the memory copy is asynchronous
-    /// @note If the device/GPU buffer does not exist it is first allocated
-    /// @warning Assumes that the host/CPU buffer already exists
-    void deviceUpload(void* stream = nullptr, bool sync = true) const;
-
-    /// @brief Download this buffer from the device to the host, i.e. GPU -> CPU.
-    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
-    /// @param sync if false the memory copy is asynchronous
-    /// @note If the host/CPU buffer does not exist it is first allocated
-    /// @warning Assumes that the device/GPU buffer already exists
-    void deviceDownload(void* stream = nullptr, bool sync = true) const;
-
-    /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator.
-    uint64_t size() const { return mSize; }
-
-    //@{
-    /// @brief Returns true if this allocator is empty, i.e. has no allocated memory
-    bool empty() const { return mSize == 0; }
-    bool isEmpty() const { return mSize == 0; }
-    //@}
-
-    /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL
-    void clear(void* stream = nullptr);
-
-}; // CudaDeviceBuffer class
-
-template<>
-struct BufferTraits<CudaDeviceBuffer>
-{
-    static constexpr bool hasDeviceDual = true;
-};
-
-// --------------------------> Implementations below <------------------------------------
-
-inline CudaDeviceBuffer CudaDeviceBuffer::create(uint64_t size, const CudaDeviceBuffer*, bool host, void* stream)
-{
-    return CudaDeviceBuffer(size, host, stream);
-}
-
-inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
-{
-    if (mSize>0) this->clear(stream);
-    NANOVDB_ASSERT(size > 0);
-    if (host) {
-        cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
-        checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer");
-    } else {
-        cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
-        checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer");
-    }
-    mSize = size;
-} // CudaDeviceBuffer::init
-
-inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const
-{
-    checkPtr(mCpuData, "uninitialized cpu data");
-    if (mGpuData == nullptr) {
-        cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
-    }
-    checkPtr(mGpuData, "uninitialized gpu data");
-    cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
-    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
-} // CudaDeviceBuffer::gpuUpload
-
-inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const
-{
-    checkPtr(mGpuData, "uninitialized gpu data");
-    if (mCpuData == nullptr) {
-        cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
-    }
-    checkPtr(mCpuData, "uninitialized cpu data");
-    cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream)));
-    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
-} // CudaDeviceBuffer::gpuDownload
-
-inline void CudaDeviceBuffer::clear(void *stream)
-{
-    if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
-    if (mCpuData) cudaCheck(cudaFreeHost(mCpuData));
-    mCpuData = mGpuData = nullptr;
-    mSize = 0;
-} // CudaDeviceBuffer::clear
-
-} // namespace nanovdb
-
-#endif // end of NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/cuda/DeviceBuffer.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/cuda/DeviceBuffer.h instead.")
\ No newline at end of file
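The dual-buffer workflow supported by the removed implementation, as a sketch (sizes and the kernel work are placeholders):

    auto buffer = nanovdb::CudaDeviceBuffer::create(1 << 20); // 1 MB of pinned host memory
    // ... fill buffer.data() on the host ...
    buffer.deviceUpload();   // allocates device memory on first use, then copies H -> D
    // ... run kernels on buffer.deviceData() ...
    buffer.deviceDownload(); // copies D -> H
    buffer.clear();          // frees both allocations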
diff --git a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
index e3ae9a941f..e52ee89ac4 100644
--- a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
@@ -1,244 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file CudaGridChecksum.cuh
-
-    \author Ken Museth
-
-    \date September 28, 2023
-
-    \brief Compute CRC32 checksum of NanoVDB grids
-
-*/
-
-#ifndef NANOVDB_CUDA_GRID_CHECKSUM_CUH_HAS_BEEN_INCLUDED
-#define NANOVDB_CUDA_GRID_CHECKSUM_CUH_HAS_BEEN_INCLUDED
-
-#include "CudaDeviceBuffer.h"// required for instantiation of move c-tor of GridHandle
-#include "CudaNodeManager.cuh"
-#include "../GridChecksum.h"// for
-#include "../GridHandle.h"
-
-namespace nanovdb {
-
-namespace crc32 {
-
-/// @brief Cuda kernel to initiate lookup table for CRC32 computation
-/// @tparam T Dummy template parameter used to avoid multiple instantiations. T should be uint32_t!
-/// @param d_lut Device pointer to lookup table of size 256
-template <typename T>
-__global__ void initLutKernel(T *d_lut)
-{
-    static_assert(is_same<T, uint32_t>::value, "Expected uint32_t");
-    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < 256u) crc32::initLut(d_lut, tid);
-}
-
-/// @brief Cuda kernel that computes CRC32 checksums of blocks of data using a look-up-table
-/// @param d_data device pointer to raw data from which to compute the CRC32 checksums
-/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block
-/// @param blockCount number of blocks and checksums
-/// @param blockSize size of each block in bytes
-/// @param d_lut device pointer to CRC32 Lookup Table
-template <typename T>
-__global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize, const uint32_t *d_lut)
-{
-    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < blockCount) d_blockCRC[tid] = crc32::checksum((const uint8_t*)d_data + tid * blockSize, blockSize, d_lut);
-}
-
-/// @brief Cuda kernel that computes CRC32 checksums of blocks of data (without using a look-up-table)
-/// @param d_data device pointer to raw data from which to compute the CRC32 checksums
-/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block
-/// @param blockCount number of blocks and checksums
-/// @param blockSize size of each block in bytes
-template <typename T>
-__global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize)
-{
-    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < blockCount) d_blockCRC[tid] = crc32::checksum((const uint8_t*)d_data + tid * blockSize, blockSize);
-}
-
-/// @brief Host function to allocate and initiate a Look-Up-Table of size 256 for subsequent CRC32 computation on the device
-/// @param stream optional cuda stream (defaults to zero)
-/// @return returns a device point to a lookup-table for CRC32 computation
-/// @warning It is the responsibility of the caller to delete the returned array
-inline uint32_t* cudaCreateLut(cudaStream_t stream = 0)
-{
-    uint32_t *d_lut;
-    cudaCheck(cudaMallocAsync((void**)&d_lut, 256*sizeof(uint32_t), stream));
-    initLutKernel<<<1, 256, 0, stream>>>(d_lut);
-    cudaCheckError();
-    return d_lut;
-}
-
-}// namespace crc32
-
-#ifdef NANOVDB_CRC32_LOG2_BLOCK_SIZE// new approach computes CRC32 checksums for each 4 KB block
-
-/// @brief Update the checksum of a device grid
-/// @param d_gridData device pointer to GridData
-/// @param mode Mode of computation for the checksum.
-/// @param stream optional cuda stream (defaults to zero)
-/// @return The actual mode used for checksum computation. Eg. if @c d_gridData is NULL (or @c mode = ChecksumMode::Disable)
-///         then ChecksumMode::Disable is always returned. Else if the grid has no nodes or blind data ChecksumMode::Partial
-///         is always returned (even if @c mode = ChecksumMode::Full).
-inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = ChecksumMode::Partial, cudaStream_t stream = 0)
-{
-    if (d_gridData == nullptr || mode == ChecksumMode::Disable) return ChecksumMode::Disable;
-
-    static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256!
-    auto numBlocks = [&](unsigned int n)->unsigned int{return (n + mNumThreads - 1) / mNumThreads;};
-    uint8_t *d_begin = reinterpret_cast<uint8_t*>(d_gridData);
-    uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
-    uint64_t size[2], *d_size;// {total size of grid, partial size for first checksum}
-    cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
-
-    // Compute CRC32 checksum of GridData, TreeData, RootData (+tiles), but exclude GridData::mMagic and GridData::mChecksum
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
-        d_size[0] = d_gridData->mGridSize;
-        uint8_t *d_mid = d_gridData->template nodePtr<2>();
-        if (d_mid == nullptr) {// no upper nodes
-            if (d_gridData->mBlindMetadataCount) {
-                d_mid = d_begin + d_gridData->mBlindMetadataOffset;// exclude blind data from partial checksum
-            } else {
-                d_mid = d_begin + d_gridData->mGridSize;// no nodes or blind data, so partial checksum is computed on the entire grid buffer
-            }
-        }
-        d_size[1] = d_mid - d_begin;
-        uint32_t *p = reinterpret_cast<uint32_t*>(&(d_gridData->mChecksum));
-        p[0] = crc32::checksum(d_begin + 16u, d_mid, d_lut);// exclude GridData::mMagic and GridData::mChecksum
-    });
-    cudaCheckError();
-    cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_size, stream));
-
-    if (mode != ChecksumMode::Full || size[0] == size[1]) return ChecksumMode::Partial;
-
-    // Compute CRC32 checksum of 4K block of everything remaining in the buffer, i.e. nodes and blind data
-    const uint8_t *d_mid = d_begin + size[1], *d_end = d_begin + size[0];
-    uint32_t *d_checksums;// 4096 byte chunks
-    const uint64_t checksumCount = (d_end - d_mid) >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte)
-    cudaCheck(cudaMallocAsync((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
-    cudaLambdaKernel<<<numBlocks(checksumCount), mNumThreads, 0, stream>>>(checksumCount, [=] __device__(size_t tid) {
-        uint32_t size = 1<<NANOVDB_CRC32_LOG2_BLOCK_SIZE;
-        if (tid+1 == checksumCount) size += d_end - d_mid - (checksumCount<<NANOVDB_CRC32_LOG2_BLOCK_SIZE);
-        d_checksums[tid] = crc32::checksum(d_mid + (tid<<NANOVDB_CRC32_LOG2_BLOCK_SIZE), size, d_lut);
-    });
-    cudaCheckError();
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
-        uint32_t *p = reinterpret_cast<uint32_t*>(&(d_gridData->mChecksum));
-        p[1] = crc32::checksum((const uint8_t*)d_checksums, checksumCount*sizeof(uint32_t), d_lut);
-    });
-    cudaCheckError();
-    cudaCheck(cudaFreeAsync(d_checksums, stream));
-    cudaCheck(cudaFreeAsync(d_lut, stream));
-
-    return ChecksumMode::Full;
-}// cudaGridChecksum
-
-template <typename BuildT>
-inline ChecksumMode cudaGridChecksum(NanoGrid<BuildT> *d_grid, ChecksumMode mode = ChecksumMode::Partial, cudaStream_t stream = 0)
-{
-    return cudaGridChecksum(reinterpret_cast<GridData*>(d_grid), mode, stream);
-}
-
-inline GridChecksum cudaGetGridChecksum(GridData *d_gridData, cudaStream_t stream = 0)
-{
-    uint64_t checksum, *d_checksum;
-    cudaCheck(cudaMallocAsync((void**)&d_checksum, sizeof(uint64_t), stream));
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {*d_checksum = d_gridData->mChecksum;});
-    cudaCheckError();
-    cudaCheck(cudaMemcpyAsync(&checksum, d_checksum, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_checksum, stream));
-    return GridChecksum(checksum);
-}
-
-inline ChecksumMode cudaUpdateGridChecksum(GridData *d_gridData, cudaStream_t stream = 0)
-{
-    return cudaGridChecksum(d_gridData, cudaGetGridChecksum(d_gridData, stream).mode(), stream);
-}
-
-#else
-
-template <typename BuildT>
-void cudaGridChecksum(NanoGrid<BuildT> *d_grid, ChecksumMode mode = ChecksumMode::Partial, cudaStream_t stream = 0)
-{
-    if (d_grid == nullptr || mode == ChecksumMode::Disable) return;
-
-    static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256!
- auto numBlocks = [&](unsigned int n)->unsigned int{return (n + mNumThreads - 1) / mNumThreads;}; - - uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32 - uint64_t size[2], *d_size; - cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream)); - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { - d_size[0] = d_grid->gridSize(); - d_size[1] = d_grid->memUsage() + d_grid->tree().memUsage() + d_grid->tree().root().memUsage(); - const uint8_t *begin = reinterpret_cast(d_grid); - uint32_t *p = reinterpret_cast(&(d_grid->mChecksum)); - p[0] = crc32::checksum(begin + 16u, begin + d_size[1], d_lut);// exclude mMagic and mChecksum - }); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); - cudaCheckError(); - - if (mode != ChecksumMode::Full) return; - - // Get node counts - uint32_t nodeCount[3], *d_nodeCount, *d_checksums, *d_ptr; - cudaCheck(cudaMallocAsync((void**)&d_nodeCount, 3*sizeof(uint32_t), stream)); - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { - auto &tree = d_grid->tree(); - for (int i = 0; i < 3; ++i) d_nodeCount[i] = tree.nodeCount(i); - }); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(nodeCount, d_nodeCount, 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_nodeCount, stream)); - cudaCheck(cudaMallocAsync((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream)); - - auto nodeMgrHandle = cudaCreateNodeManager(d_grid, CudaDeviceBuffer(), stream); - auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); - NANOVDB_ASSERT(isValid(d_nodeMgr)); - d_ptr = d_checksums; - - // very slow due to large nodes - cudaLambdaKernel<<>>(nodeCount[2], [=] __device__(size_t tid) { - auto &node = d_nodeMgr->upper(uint32_t(tid)); - d_ptr[tid] = crc32::checksum((const uint8_t*)&node, node.memUsage(), d_lut); - }); - cudaCheckError(); - - d_ptr += nodeCount[2]; - cudaLambdaKernel<<>>(nodeCount[1], [=] __device__(size_t tid) { - auto &node = d_nodeMgr->lower(uint32_t(tid)); - d_ptr[tid] = crc32::checksum((const uint8_t*)&node, node.memUsage(), d_lut); - }); - cudaCheckError(); - - d_ptr += nodeCount[1]; - cudaLambdaKernel<<>>(nodeCount[0], [=] __device__(size_t tid) { - auto &node = d_nodeMgr->leaf(uint32_t(tid)); - d_ptr[tid] = crc32::checksum((const uint8_t*)&node, node.memUsage(), d_lut); - }); - cudaCheckError(); - - // to-do: process blind data - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { - uint32_t *p = reinterpret_cast(&(d_grid->mChecksum)); - const uint8_t *begin = reinterpret_cast(d_checksums); - p[1] = crc32::checksum(begin, d_nodeMgr->tree().totalNodeCount()*sizeof(uint32_t), d_lut); - }); - cudaCheckError(); - - cudaCheck(cudaFreeAsync(d_size, stream)); - cudaCheck(cudaFreeAsync(d_checksums, stream)); - cudaCheck(cudaFreeAsync(d_lut, stream)); -}// cudaGridChecksum - -#endif - -}// namespace nanovdb - -#endif // NANOVDB_CUDA_GRID_CHECKSUM_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridChecksum.cuh instead.") diff --git a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh index 5446c56231..9e0c0faeb4 100644 --- a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh @@ -1,134 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! 
- \file CudaGridHandle.cuh - - \author Ken Museth, Doyub Kim - - \date August 3, 2023 - - \brief Contains cuda kernels for GridHandle - - \warning The header file contains cuda device code so be sure - to only include it in .cu files (or other .cuh files) -*/ - -#ifndef NANOVDB_CUDA_GRID_HANDLE_CUH_HAS_BEEN_INCLUDED -#define NANOVDB_CUDA_GRID_HANDLE_CUH_HAS_BEEN_INCLUDED - -#include "CudaDeviceBuffer.h"// required for instantiation of move c-tor of GridHandle -#include "CudaGridChecksum.cuh"// for cudaUpdateChecksum -#include "../GridHandle.h" - -namespace nanovdb { - -namespace {// anonymous namespace -__global__ void cudaCpyMetaData(const GridData *data, GridHandleMetaData *meta){cpyMetaData(data, meta);} -__global__ void cudaUpdateGridCount(GridData *data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty){ - NANOVDB_ASSERT(gridIndex < gridCount); - if (*d_dirty = data->mGridIndex != gridIndex || data->mGridCount != gridCount) { - data->mGridIndex = gridIndex; - data->mGridCount = gridCount; - if (data->mChecksum == GridChecksum::EMPTY) *d_dirty = false;// no need to update checksum if it didn't already exist - } -} -}// anonymous namespace - -template -template::hasDeviceDual, int>::type> -GridHandle::GridHandle(T&& buffer) -{ - static_assert(is_same::value, "Expected U==BufferT"); - mBuffer = std::move(buffer); - if (auto *data = reinterpret_cast(mBuffer.data())) { - if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer"); - mMetaData.resize(data->mGridCount); - cpyMetaData(data, mMetaData.data()); - } else { - if (auto *d_data = reinterpret_cast(mBuffer.deviceData())) { - GridData tmp; - cudaCheck(cudaMemcpy(&tmp, d_data, sizeof(GridData), cudaMemcpyDeviceToHost)); - if (!tmp.isValid()) throw std::runtime_error("GridHandle was constructed with an invalid device buffer"); - GridHandleMetaData *d_metaData; - cudaMalloc((void**)&d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData)); - cudaCpyMetaData<<<1,1>>>(d_data, d_metaData); - mMetaData.resize(tmp.mGridCount); - cudaCheck(cudaMemcpy(mMetaData.data(), d_metaData,tmp.mGridCount*sizeof(GridHandleMetaData), cudaMemcpyDeviceToHost)); - cudaCheck(cudaFree(d_metaData)); - } - } -}// GridHandle(T&& buffer) - -// Dummy function that ensures instantiation of the move-constructor above when BufferT=CudaDeviceBuffer -namespace {auto __dummy(){return GridHandle(std::move(CudaDeviceBuffer()));}} - -template class VectorT = std::vector> -inline typename enable_if::hasDeviceDual, VectorT>>::type -cudaSplitGridHandles(const GridHandle &handle, const BufferT* other = nullptr, cudaStream_t stream = 0) -{ - const uint8_t *ptr = handle.deviceData(); - if (ptr == nullptr) return VectorT>(); - VectorT> handles(handle.gridCount()); - bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed - cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream)); - for (uint32_t n=0; n(buffer.deviceData()); - const GridData *src = reinterpret_cast(ptr); - cudaCheck(cudaMemcpyAsync(dst, src, handle.gridSize(n), cudaMemcpyDeviceToDevice, stream)); - cudaUpdateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream)); - if (dirty) cudaGridChecksum(dst, ChecksumMode::Partial); - handles[n] = GridHandle(std::move(buffer)); - ptr += handle.gridSize(n); - } - cudaCheck(cudaFreeAsync(d_dirty, stream)); - return std::move(handles); -}// cudaSplitGridHandles - -template class VectorT = 
std::vector> -inline typename enable_if::hasDeviceDual, VectorT>>::type -splitDeviceGrids(const GridHandle &handle, const BufferT* other = nullptr, cudaStream_t stream = 0) -{ return cudaSplitGridHandles(handle, other, stream); } - -template class VectorT> -inline typename enable_if::hasDeviceDual, GridHandle>::type -cudaMergeGridHandles(const VectorT> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0) -{ - uint64_t size = 0u; - uint32_t counter = 0u, gridCount = 0u; - for (auto &h : handles) { - gridCount += h.gridCount(); - for (uint32_t n=0; n(dst); - cudaUpdateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream)); - if (dirty) cudaGridChecksum(data, ChecksumMode::Partial); - dst += h.gridSize(n); - src += h.gridSize(n); - } - } - cudaCheck(cudaFreeAsync(d_dirty, stream)); - return GridHandle(std::move(buffer)); -}// cudaMergeGridHandles - -template class VectorT> -inline typename enable_if::hasDeviceDual, GridHandle>::type -mergeDeviceGrids(const VectorT> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0) -{ return cudaMergeGridHandles(handles, other, stream); } - -} // namespace nanovdb - -#endif // NANOVDB_CUDA_GRID_HANDLE_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/cuda/GridHandle.cuh instead.") \ No newline at end of file diff --git a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh index dcf5bfc850..64c6490768 100644 --- a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh @@ -1,250 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file CudaGridStats.cuh - - \author Ken Museth - - \date October 9, 2023 - - \brief Re-computes min/max/avg/var/bbox information for each node in a - pre-existing NanoVDB grid on the device. -*/ - -#ifndef NANOVDB_CUDAGRIDSTATS_CUH_HAS_BEEN_INCLUDED -#define NANOVDB_CUDAGRIDSTATS_CUH_HAS_BEEN_INCLUDED - -#include -#include - -namespace nanovdb { - -/// @brief Re-computes the min/max, stats and bbox information for an existing NanoVDB Grid -/// -/// @param grid Grid whose stats to update -/// @param mode Mode of computation for the statistics. 
-/// @param stream Optional cuda stream (defaults to zero) -template -void cudaGridStats(NanoGrid *d_grid, StatsMode mode = StatsMode::Default, cudaStream_t stream = 0); - -//================================================================================================ - -/// @brief Allows for the construction of NanoVDB grids without any dependecy -template::ValueType>> -class CudaGridStats -{ - using GridT = NanoGrid; - using TreeT = typename GridT::TreeType; - using ValueT = typename TreeT::ValueType; - using Node0 = typename TreeT::Node0; // leaf - using Node1 = typename TreeT::Node1; // lower - using Node2 = typename TreeT::Node2; // upper - using RootT = typename TreeT::Node3; // root - static_assert(is_same::value, "Mismatching type"); - - ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta - -public: - CudaGridStats(ValueT delta = ValueT(0)) : mDelta(delta) {} - - void operator()(GridT *d_grid, cudaStream_t stream = 0); - -}; // CudaGridStats - -//================================================================================================ - -namespace {// define cuda kernels in an unnamed namespace - -template -__global__ void processLeaf(NodeManager *d_nodeMgr, StatsT *d_stats) -{ - const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= d_nodeMgr->leafCount()) return; - auto &d_leaf = d_nodeMgr->leaf(tid); - - if (d_leaf.updateBBox()) {// updates active bounding box (also updates data->mFlags) and return true if non-empty - if constexpr(StatsT::hasStats()) { - StatsT stats; - for (auto it = d_leaf.cbeginValueOn(); it; ++it) stats.add(*it); - if constexpr(StatsT::hasAverage()) { - d_stats[tid] = stats; - *reinterpret_cast(&d_leaf.mMinimum) = tid; - } else { - stats.setStats(d_leaf); - } - } - } - d_leaf.mFlags &= ~uint8_t(1u);// enable rendering -}// processLeaf - -template -__global__ void processInternal(NodeManager *d_nodeMgr, StatsT *d_stats) -{ - using ChildT = typename NanoNode::type; - const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= d_nodeMgr->nodeCount(LEVEL)) return; - auto &d_node = d_nodeMgr->template node(tid); - auto &bbox = d_node.mBBox; - bbox = CoordBBox();// empty bbox - StatsT stats; - uint32_t childID = 0u; - - for (auto it = d_node.beginChild(); it; ++it) { - auto &child = *it; - bbox.expand( child.bbox() ); - if constexpr(StatsT::hasAverage()) { - childID = *reinterpret_cast(&child.mMinimum); - StatsT &s = d_stats[childID]; - s.setStats(child); - stats.add(s); - } else if constexpr(StatsT::hasMinMax()) { - stats.add(child.minimum()); - stats.add(child.maximum()); - } - } - for (auto it = d_node.cbeginValueOn(); it; ++it) { - const Coord ijk = it.getCoord(); - bbox[0].minComponent(ijk); - bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); - if constexpr(StatsT::hasStats()) stats.add(*it, ChildT::NUM_VALUES); - } - if constexpr(StatsT::hasAverage()) { - d_stats[childID] = stats; - *reinterpret_cast(&d_node.mMinimum) = childID; - } else if constexpr(StatsT::hasMinMax()) { - stats.setStats(d_node); - } - d_node.mFlags &= ~uint64_t(1u);// enable rendering -}// processInternal - -template -__global__ void processRootAndGrid(NodeManager *d_nodeMgr, StatsT *d_stats) -{ - using ChildT = NanoUpper; - using ValueT = typename ChildT::ValueType; - - // process root - auto &root = d_nodeMgr->root(); - root.mBBox = CoordBBox(); - if (root.isEmpty()) { - root.mMinimum = root.mMaximum = root.mBackground; - root.mAverage = root.mStdDevi = 0; - } else { - ValueT v; - StatsT s; - for (auto it = 
root.beginDense(); it; ++it) { - if (auto *child = it.probeChild(v)) { - root.mBBox.expand( child->bbox() ); - if constexpr(StatsT::hasAverage()) { - StatsT &stats = d_stats[*reinterpret_cast(&child->mMinimum)]; - stats.setStats(*child); - s.add(stats); - } else if constexpr(StatsT::hasMinMax()){ - s.add(child->minimum()); - s.add(child->maximum()); - } - } else if (it.isValueOn()) { - const Coord ijk = it.getCoord(); - root.mBBox[0].minComponent(ijk); - root.mBBox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); - if constexpr(StatsT::hasStats()) s.add(v, ChildT::NUM_VALUES); - } - } - s.setStats(root); - } - - // process Grid - auto& grid = d_nodeMgr->grid(); - const auto& indexBBox = root.bbox(); - if (indexBBox.empty()) { - grid.mWorldBBox = BBox(); - grid.setBBoxOn(false); - } else { - // Note that below max is offset by one since CoordBBox.max is inclusive - // while bbox.max is exclusive. However, min is inclusive in both - // CoordBBox and BBox. This also guarantees that a grid with a single - // active voxel, does not have an empty world bbox! E.g. if a grid with a - // unit index-to-world transformation only contains the active voxel (0,0,0) - // then indeBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0) - // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions - // of index and world bounding boxes inherited from OpenVDB! - const Coord min = indexBBox[0]; - const Coord max = indexBBox[1] + Coord(1); - - auto& wBBox = grid.mWorldBBox; - const auto& map = grid.map(); - wBBox[0] = wBBox[1] = map.applyMap(Vec3d(min[0], min[1], min[2])); - wBBox.expand(map.applyMap(Vec3d(min[0], min[1], max[2]))); - wBBox.expand(map.applyMap(Vec3d(min[0], max[1], min[2]))); - wBBox.expand(map.applyMap(Vec3d(max[0], min[1], min[2]))); - wBBox.expand(map.applyMap(Vec3d(max[0], max[1], min[2]))); - wBBox.expand(map.applyMap(Vec3d(max[0], min[1], max[2]))); - wBBox.expand(map.applyMap(Vec3d(min[0], max[1], max[2]))); - wBBox.expand(map.applyMap(Vec3d(max[0], max[1], max[2]))); - grid.setBBoxOn(true); - } - - // set bit flags - grid.setMinMaxOn(StatsT::hasMinMax()); - grid.setAverageOn(StatsT::hasAverage()); - grid.setStdDeviationOn(StatsT::hasStdDeviation()); -}// processRootAndGrid - -}// cuda kernels are defined in an unnamed namespace - -//================================================================================================ - -template -void CudaGridStats::operator()(NanoGrid *d_grid, cudaStream_t stream) -{ - static const uint32_t threadsPerBlock = 128; - auto blocksPerGrid = [&](uint32_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; - - auto nodeMgrHandle = cudaCreateNodeManager(d_grid, CudaDeviceBuffer(), stream); - auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); - - uint32_t nodeCount[3];// {leaf, lower, upper} - cudaCheck(cudaMemcpyAsync(nodeCount, (char*)d_grid + sizeof(GridData) + 4*sizeof(uint64_t), 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - cudaStreamSynchronize(stream);// finish all device tasks in stream - - StatsT *d_stats = nullptr; - - if constexpr(StatsT::hasAverage()) cudaCheck(cudaMallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream)); - - processLeaf<<>>(d_nodeMgr, d_stats); - - processInternal<<>>(d_nodeMgr, d_stats); - - processInternal<<>>(d_nodeMgr, d_stats); - - processRootAndGrid<<<1, 1, 0, stream>>>(d_nodeMgr, d_stats); - - if constexpr(StatsT::hasAverage()) cudaCheck(cudaFreeAsync(d_stats, stream)); - -} // CudaGridStats::operator()( Grid ) - 
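// [Editor's example] The world-space bbox expansion used in processRootAndGrid,
// factored into a standalone sketch: an affine Map can rotate or shear, so all
// eight corners of the index bbox must be transformed before taking the hull.
// The helper name exampleIndexToWorldBBox is illustrative, not part of the header.
__hostdev__ inline nanovdb::BBox<nanovdb::Vec3d> exampleIndexToWorldBBox(const nanovdb::CoordBBox &indexBBox, const nanovdb::Map &map)
{
    using namespace nanovdb;
    const Coord min = indexBBox[0];
    const Coord max = indexBBox[1] + Coord(1);// CoordBBox::max is inclusive, world-space max is exclusive
    BBox<Vec3d> wBBox;
    wBBox[0] = wBBox[1] = map.applyMap(Vec3d(min[0], min[1], min[2]));
    wBBox.expand(map.applyMap(Vec3d(min[0], min[1], max[2])));
    wBBox.expand(map.applyMap(Vec3d(min[0], max[1], min[2])));
    wBBox.expand(map.applyMap(Vec3d(max[0], min[1], min[2])));
    wBBox.expand(map.applyMap(Vec3d(max[0], max[1], min[2])));
    wBBox.expand(map.applyMap(Vec3d(max[0], min[1], max[2])));
    wBBox.expand(map.applyMap(Vec3d(min[0], max[1], max[2])));
    wBBox.expand(map.applyMap(Vec3d(max[0], max[1], max[2])));
    return wBBox;// non-empty even for a single active voxel (see note above)
}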
-//================================================================================================
-
-template <typename BuildT>
-void cudaGridStats(NanoGrid<BuildT> *d_grid, StatsMode mode, cudaStream_t stream)
-{
-    if (d_grid == nullptr && mode == StatsMode::Disable) {
-        return;
-    } else if (mode == StatsMode::BBox || is_same<bool, BuildT>::value) {
-        CudaGridStats<BuildT, NoopStats<BuildT> > stats;
-        stats(d_grid, stream);
-    } else if (mode == StatsMode::MinMax) {
-        CudaGridStats<BuildT, Extrema<BuildT> > stats;
-        stats(d_grid, stream);
-    } else if (mode == StatsMode::All) {
-        CudaGridStats<BuildT, Stats<BuildT> > stats;
-        stats(d_grid, stream);
-    } else {
-        throw std::runtime_error("cudaGridStats: Unsupported statistics mode.");
-    }
-}// cudaGridStats
-
-} // namespace nanovdb
-
-#endif // NANOVDB_CUDAGRIDSTATS_CUH_HAS_BEEN_INCLUDED
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/GridStats.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridStats.cuh instead.")
diff --git a/nanovdb/nanovdb/util/cuda/CudaGridValidator.cuh b/nanovdb/nanovdb/util/cuda/CudaGridValidator.cuh
new file mode 100644
index 0000000000..ca535d4013
--- /dev/null
+++ b/nanovdb/nanovdb/util/cuda/CudaGridValidator.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/GridValidator.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridValidator.cuh instead.")
diff --git a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
index 8394ecefe1..ed5e67da33 100644
--- a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
@@ -1,386 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file CudaIndexToGrid.cuh
-
-    \author Ken Museth
-
-    \date April 17, 2023
-
-    \brief Combines an IndexGrid and values into a regular Grid on the device
-
-    \warning The header file contains cuda device code so be sure
-             to only include it in .cu files (or other .cuh files)
-*/
-
-#ifndef NVIDIA_CUDA_INDEX_TO_GRID_CUH_HAS_BEEN_INCLUDED
-#define NVIDIA_CUDA_INDEX_TO_GRID_CUH_HAS_BEEN_INCLUDED
-
-#include
-#include "CudaDeviceBuffer.h"
-#include
-#include
-#include
-
-namespace nanovdb {
-
-/// @brief Freestanding function that combines an IndexGrid and values into a regular Grid
-/// @tparam DstBuildT Build type of the destination/output Grid
-/// @tparam SrcBuildT Build type of the source/input IndexGrid
-/// @tparam BufferT Type of the buffer used for allocation of the destination Grid
-/// @param d_srcGrid Device pointer to source/input IndexGrid, i.e. SrcBuildT={ValueIndex,ValueOnIndex,ValueIndexMask,ValueOnIndexMask}
-/// @param d_srcValues Device pointer to an array of values
-/// @param pool Memory pool used to create a buffer for the destination/output Grid
-/// @param stream optional CUDA stream (defaults to CUDA stream 0)
-/// @note If d_srcGrid has stats (min,max,avg,std-dev), then d_srcValues is also assumed
-///       to have the same information, all of which are then copied to the destination/output grid.
-///       An exception to this rule is if the type of d_srcValues is different from the stats type
-///       NanoRoot<DstBuildT>::FloatType, e.g. if DstBuildT=Vec3f then NanoRoot<Vec3f>::FloatType=float,
-///       in which case average and standard-deviation are undefined in the output grid.
-/// @return -template -typename enable_if::is_index, GridHandle>::type -cudaIndexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0); - - -template -typename enable_if::is_index, GridHandle>::type -cudaCreateNanoGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) -{ - return cudaIndexToGrid(d_srcGrid, d_srcValues, pool, stream); -} - -namespace {// anonymous namespace - -template -class CudaIndexToGrid -{ - using SrcGridT = NanoGrid; -public: - struct NodeAccessor; - - /// @brief Constructor from a source IndeGrid - /// @param srcGrid Device pointer to IndexGrid used as the source - CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0); - - ~CudaIndexToGrid() {cudaCheck(cudaFreeAsync(mDevNodeAcc, mStream));} - - /// @brief Toggle on and off verbose mode - /// @param on if true verbose is turned on - void setVerbose(bool on = true) {mVerbose = on; } - - /// @brief Set the name of the destination/output grid - /// @param name Name used for the destination grid - void setGridName(const std::string &name) {mGridName = name;} - - /// @brief Combines the IndexGrid with values to produce a regular Grid - /// @tparam DstBuildT Template parameter of the destination grid and value type - /// @tparam BufferT Template parameter of the memory allocator - /// @param srcValues pointer to values that will be inserted into the output grid - /// @param buffer optional buffer used for memory allocation - /// @return A new GridHandle with the grid of type @c DstBuildT - template - GridHandle getHandle(const typename BuildToValueMap::type *srcValues, const BufferT &buffer = BufferT()); - -private: - cudaStream_t mStream{0}; - GpuTimer mTimer; - std::string mGridName; - bool mVerbose{false}; - NodeAccessor mNodeAcc, *mDevNodeAcc; - - template - BufferT getBuffer(const BufferT &pool); -};// CudaIndexToGrid - -//================================================================================================ - -template -struct CudaIndexToGrid::NodeAccessor -{ - uint64_t grid, tree, root, node[3], meta, blind, size;// byte offsets, node: 0=leaf,1=lower, 2=upper - const SrcGridT *d_srcGrid;// device point to source IndexGrid - void *d_dstPtr;// device pointer to buffer with destination Grid - char *d_gridName; - uint32_t nodeCount[4];// 0=leaf, 1=lower, 2=upper, 3=root tiles - - __device__ const NanoGrid& srcGrid() const {return *d_srcGrid;} - __device__ const NanoTree& srcTree() const {return d_srcGrid->tree();} - __device__ const NanoRoot& srcRoot() const {return d_srcGrid->tree().root();} - template - __device__ const typename NanoNode::type& srcNode(int i) const { - return *(this->srcTree().template getFirstNode() + i); - } - - template - __device__ NanoGrid& dstGrid() const {return *PtrAdd>(d_dstPtr, grid);} - template - __device__ NanoTree& dstTree() const {return *PtrAdd>(d_dstPtr, tree);} - template - __device__ NanoRoot& dstRoot() const {return *PtrAdd>(d_dstPtr, root);} - template - __device__ typename NanoNode::type& dstNode(int i) const { - return *(PtrAdd::type>(d_dstPtr, node[LEVEL])+i); - } -};// CudaIndexToGrid::NodeAccessor - -//================================================================================================ - -template -__global__ void cudaProcessGridTreeRoot(typename CudaIndexToGrid::NodeAccessor *nodeAcc, - const typename BuildToValueMap::type *srcValues) -{ - using SrcValueT = typename 
BuildToValueMap::type; - using DstStatsT = typename NanoRoot::FloatType; - - auto &srcGrid = nodeAcc->srcGrid(); - auto &dstGrid = nodeAcc->template dstGrid(); - auto &srcTree = srcGrid.tree(); - auto &dstTree = nodeAcc->template dstTree(); - auto &srcRoot = srcTree.root(); - auto &dstRoot = nodeAcc->template dstRoot(); - - // process Grid - *dstGrid.data() = *srcGrid.data(); - dstGrid.mGridType = mapToGridType(); - dstGrid.mData1 = 0u; - // we will recompute GridData::mChecksum later - - // process Tree - *dstTree.data() = *srcTree.data(); - dstTree.setRoot(&dstRoot); - dstTree.setFirstNode(&nodeAcc->template dstNode(0)); - dstTree.setFirstNode(&nodeAcc->template dstNode(0)); - dstTree.setFirstNode(&nodeAcc->template dstNode(0)); - - // process Root - dstRoot.mBBox = srcRoot.mBBox; - dstRoot.mTableSize = srcRoot.mTableSize; - dstRoot.mBackground = srcValues[srcRoot.mBackground]; - if (srcGrid.hasMinMax()) { - dstRoot.mMinimum = srcValues[srcRoot.mMinimum]; - dstRoot.mMaximum = srcValues[srcRoot.mMaximum]; - } - if constexpr(is_same::value) {// e.g. {float,float} or {Vec3f,float} - if (srcGrid.hasAverage()) dstRoot.mAverage = srcValues[srcRoot.mAverage]; - if (srcGrid.hasStdDeviation()) dstRoot.mStdDevi = srcValues[srcRoot.mStdDevi]; - } -}// cudaProcessGridTreeRoot - -//================================================================================================ - -template -__global__ void cudaProcessRootTiles(typename CudaIndexToGrid::NodeAccessor *nodeAcc, - const typename BuildToValueMap::type *srcValues) -{ - const auto tid = blockIdx.x; - - // Process children and tiles - const auto &srcTile = *nodeAcc->srcRoot().tile(tid); - auto &dstTile = *nodeAcc->template dstRoot().tile(tid); - dstTile.key = srcTile.key; - if (srcTile.child) { - dstTile.child = sizeof(NanoRoot) + sizeof(NanoRoot::Tile)*((srcTile.child - sizeof(NanoRoot))/sizeof(NanoRoot::Tile)); - dstTile.value = srcValues[0];// set to background - dstTile.state = false; - } else { - dstTile.child = 0;// i.e. no child node - dstTile.value = srcValues[srcTile.value]; - dstTile.state = srcTile.state; - } -}// cudaProcessRootTiles - -//================================================================================================ - -template -__global__ void cudaProcessInternalNodes(typename CudaIndexToGrid::NodeAccessor *nodeAcc, - const typename BuildToValueMap::type *srcValues) -{ - using SrcNodeT = typename NanoNode::type; - using DstNodeT = typename NanoNode::type; - using SrcChildT = typename SrcNodeT::ChildNodeType; - using DstChildT = typename DstNodeT::ChildNodeType; - using SrcValueT = typename BuildToValueMap::type; - using DstStatsT = typename NanoRoot::FloatType; - - auto &srcNode = nodeAcc->template srcNode(blockIdx.x); - auto &dstNode = nodeAcc->template dstNode(blockIdx.x); - - if (threadIdx.x == 0 && threadIdx.y == 0) { - dstNode.mBBox = srcNode.mBBox; - dstNode.mFlags = srcNode.mFlags; - dstNode.mValueMask = srcNode.mValueMask; - dstNode.mChildMask = srcNode.mChildMask; - auto &srcGrid = nodeAcc->srcGrid(); - if (srcGrid.hasMinMax()) { - dstNode.mMinimum = srcValues[srcNode.mMinimum]; - dstNode.mMaximum = srcValues[srcNode.mMaximum]; - } - if constexpr(is_same::value) {// e.g. 
{float,float} or {Vec3f,float} - if (srcGrid.hasAverage()) dstNode.mAverage = srcValues[srcNode.mAverage]; - if (srcGrid.hasStdDeviation()) dstNode.mStdDevi = srcValues[srcNode.mStdDevi]; - } - } - const uint64_t nodeSkip = nodeAcc->nodeCount[LEVEL] - blockIdx.x, srcOff = sizeof(SrcNodeT)*nodeSkip, dstOff = sizeof(DstNodeT)*nodeSkip;// offset to first node of child type - const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; - for (int threadIdx_z=0; threadIdx_z -__global__ void cudaProcessLeafNodes(typename CudaIndexToGrid::NodeAccessor *nodeAcc, - const typename BuildToValueMap::type *srcValues) -{ - using SrcValueT = typename BuildToValueMap::type; - using DstStatsT = typename NanoRoot::FloatType; - static_assert(!BuildTraits::is_special, "Invalid destination type!"); - auto &srcLeaf = nodeAcc->template srcNode<0>(blockIdx.x); - auto &dstLeaf = nodeAcc->template dstNode(blockIdx.x); - if (threadIdx.x == 0 && threadIdx.y == 0) { - dstLeaf.mBBoxMin = srcLeaf.mBBoxMin; - for (int i=0; i<3; ++i) dstLeaf.mBBoxDif[i] = srcLeaf.mBBoxDif[i]; - dstLeaf.mFlags = srcLeaf.mFlags; - dstLeaf.mValueMask = srcLeaf.mValueMask; - /// - auto &srcGrid = nodeAcc->srcGrid(); - if (srcGrid.hasMinMax()) { - dstLeaf.mMinimum = srcValues[srcLeaf.getMin()]; - dstLeaf.mMaximum = srcValues[srcLeaf.getMax()]; - } - if constexpr(is_same::value) {// e.g. {float,float} or {Vec3f,float} - if (srcGrid.hasAverage()) dstLeaf.mAverage = srcValues[srcLeaf.getAvg()]; - if (srcGrid.hasStdDeviation()) dstLeaf.mStdDevi = srcValues[srcLeaf.getDev()]; - } - } - const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; - auto *dst = dstLeaf.mValues + off; - for (int threadIdx_z=0; threadIdx_z -__global__ void cudaCpyNodeCount(const NanoGrid *srcGrid, - typename CudaIndexToGrid::NodeAccessor *nodeAcc) -{ - assert(srcGrid->isSequential()); - nodeAcc->d_srcGrid = srcGrid; - for (int i=0; i<3; ++i) nodeAcc->nodeCount[i] = srcGrid->tree().nodeCount(i); - nodeAcc->nodeCount[3] = srcGrid->tree().root().tileCount(); -} - -}// anonymous namespace - -//================================================================================================ - -template -CudaIndexToGrid::CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream) - : mStream(stream), mTimer(stream) -{ - NANOVDB_ASSERT(d_srcGrid); - cudaCheck(cudaMallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream)); - cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc -} - -//================================================================================================ - -template -template -GridHandle CudaIndexToGrid::getHandle(const typename BuildToValueMap::type *srcValues, - const BufferT &pool) -{ - if (mVerbose) mTimer.start("Initiate buffer"); - auto buffer = this->template getBuffer(pool); - - if (mVerbose) mTimer.restart("Process grid,tree,root"); - cudaProcessGridTreeRoot<<<1, 1, 0, mStream>>>(mDevNodeAcc, srcValues); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Process root children and tiles"); - cudaProcessRootTiles<<>>(mDevNodeAcc, srcValues); - cudaCheckError(); - - cudaCheck(cudaFreeAsync(mNodeAcc.d_gridName, mStream)); - - if (mVerbose) mTimer.restart("Process upper internal nodes"); - cudaProcessInternalNodes<<>>(mDevNodeAcc, srcValues); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Process lower internal nodes"); - 
cudaProcessInternalNodes<<>>(mDevNodeAcc, srcValues); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Process leaf nodes"); - cudaProcessLeafNodes<<>>(mDevNodeAcc, srcValues); - if (mVerbose) mTimer.stop(); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Compute checksums"); - cudaUpdateGridChecksum((GridData*)mNodeAcc.d_dstPtr, mStream); - if (mVerbose) mTimer.stop(); - - cudaStreamSynchronize(mStream);// finish all device tasks in mStream - return GridHandle(std::move(buffer)); -}// CudaIndexToGrid::getHandle - -//================================================================================================ - -template -template -inline BufferT CudaIndexToGrid::getBuffer(const BufferT &pool) -{ - mNodeAcc.grid = 0;// grid is always stored at the start of the buffer! - mNodeAcc.tree = NanoGrid::memUsage(); // grid ends and tree begins - mNodeAcc.root = mNodeAcc.tree + NanoTree::memUsage(); // tree ends and root node begins - mNodeAcc.node[2] = mNodeAcc.root + NanoRoot::memUsage(mNodeAcc.nodeCount[3]); // root node ends and upper internal nodes begin - mNodeAcc.node[1] = mNodeAcc.node[2] + NanoUpper::memUsage()*mNodeAcc.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin - mNodeAcc.node[0] = mNodeAcc.node[1] + NanoLower::memUsage()*mNodeAcc.nodeCount[1]; // lower internal nodes ends and leaf nodes begin - mNodeAcc.meta = mNodeAcc.node[0] + NanoLeaf::DataType::memUsage()*mNodeAcc.nodeCount[0];// leaf nodes end and blind meta data begins - mNodeAcc.blind = mNodeAcc.meta + 0*sizeof(GridBlindMetaData); // meta data ends and blind data begins - mNodeAcc.size = mNodeAcc.blind;// end of buffer - auto buffer = BufferT::create(mNodeAcc.size, &pool, false, mStream); - mNodeAcc.d_dstPtr = buffer.deviceData(); - if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device"); - - if (size_t size = mGridName.size()) { - cudaCheck(cudaMallocAsync((void**)&mNodeAcc.d_gridName, size, mStream)); - cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream)); - } else { - mNodeAcc.d_gridName = nullptr; - } - cudaCheck(cudaMemcpyAsync(mDevNodeAcc, &mNodeAcc, sizeof(NodeAccessor), cudaMemcpyHostToDevice, mStream));// copy NodeAccessor CPU -> GPU - return buffer; -} - -//================================================================================================ - -template -typename enable_if::is_index, GridHandle>::type -cudaIndexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool, cudaStream_t stream) -{ - CudaIndexToGrid converter(d_srcGrid, stream); - return converter.template getHandle(d_srcValues, pool); -} - -}// nanovdb namespace - -#endif // NVIDIA_CUDA_INDEX_TO_GRID_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/IndexToGrid.cuh instead.") diff --git a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh index 3d35a4b902..5aa5b84965 100644 --- a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh @@ -1,90 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! 
- \file CudaNodeManager.cuh - - \author Ken Museth - - \date October 3, 2023 - - \brief Contains cuda kernels for NodeManager - - \warning The header file contains cuda device code so be sure - to only include it in .cu files (or other .cuh files) -*/ - -#ifndef NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED -#define NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED - -#include "CudaUtils.h"// for cudaLambdaKernel -#include "CudaDeviceBuffer.h" -#include "../NodeManager.h" - -namespace nanovdb { - -/// @brief Construct a NodeManager from a device grid pointer -/// -/// @param d_grid device grid pointer whose nodes will be accessed sequentially -/// @param buffer buffer from which to allocate the output handle -/// @param stream cuda stream -/// @return Handle that contains a device NodeManager -template -inline typename enable_if::hasDeviceDual, NodeManagerHandle>::type -cudaCreateNodeManager(const NanoGrid *d_grid, - const BufferT& pool = BufferT(), - cudaStream_t stream = 0) -{ - auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream); - auto *d_data = (NodeManagerData*)buffer.deviceData(); - size_t size = 0u, *d_size; - cudaCheck(cudaMallocAsync((void**)&d_size, sizeof(size_t), stream)); - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { -#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS - *d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}}; -#else - *d_data = NodeManagerData{NANOVDB_MAGIC_NUMBER, 0u, (void*)d_grid, {0u,0u,0u}}; -#endif - *d_size = sizeof(NodeManagerData); - auto &tree = d_grid->tree(); - if (NodeManager::FIXED_SIZE && d_grid->isBreadthFirst()) { - d_data->mLinear = uint8_t(1u); - d_data->mOff[0] = PtrDiff(tree.template getFirstNode<0>(), d_grid); - d_data->mOff[1] = PtrDiff(tree.template getFirstNode<1>(), d_grid); - d_data->mOff[2] = PtrDiff(tree.template getFirstNode<2>(), d_grid); - } else { - *d_size += sizeof(uint64_t)*tree.totalNodeCount(); - } - }); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_size, stream)); - if (size > sizeof(NodeManagerData)) { - auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device - cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream)); - buffer = std::move(tmp); - d_data = reinterpret_cast(buffer.deviceData()); - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) { - auto &tree = d_grid->tree(); - int64_t *ptr0 = d_data->mPtr[0] = reinterpret_cast(d_data + 1); - int64_t *ptr1 = d_data->mPtr[1] = d_data->mPtr[0] + tree.nodeCount(0); - int64_t *ptr2 = d_data->mPtr[2] = d_data->mPtr[1] + tree.nodeCount(1); - // Performs depth first traversal but breadth first insertion - for (auto it2 = tree.root().cbeginChild(); it2; ++it2) { - *ptr2++ = PtrDiff(&*it2, d_grid); - for (auto it1 = it2->cbeginChild(); it1; ++it1) { - *ptr1++ = PtrDiff(&*it1, d_grid); - for (auto it0 = it1->cbeginChild(); it0; ++it0) { - *ptr0++ = PtrDiff(&*it0, d_grid); - }// loop over child nodes of the lower internal node - }// loop over child nodes of the upper internal node - }// loop over child nodes of the root node - }); - } - - return NodeManagerHandle(mapToGridType(), std::move(buffer)); -}// cudaCreateNodeManager - -} // namespace nanovdb - -#endif // NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include 
nanovdb/cuda/NodeManager.cuh instead.")
\ No newline at end of file
diff --git a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
index 733dc35cb9..91e7ad0b5c 100644
--- a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
@@ -1,1174 +1,6 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: MPL-2.0
-/*!
-    \file CudaPointsToGrid.cuh
-
-    \authors Greg Klar (initial version) and Ken Museth (final version)
-
-    \brief Generates NanoVDB grids from a list of voxels or points on the device
-
-    \warning The header file contains cuda device code so be sure
-             to only include it in .cu files (or other .cuh files)
-*/
-
-#ifndef NVIDIA_CUDA_POINTS_TO_GRID_CUH_HAS_BEEN_INCLUDED
-#define NVIDIA_CUDA_POINTS_TO_GRID_CUH_HAS_BEEN_INCLUDED
-
-#include
-#include
-#include
-#include
-
-#include
-#include "CudaDeviceBuffer.h"
-#include
-#include
-#include
-#include
-
-/*
-   Note: 4.29 billion (=2^32) coordinates of type Vec3f have a memory footprint of 48 GB!
-*/
-
-namespace nanovdb {
-
-// Define the type used when the points are encoded as blind data in the output grid
-enum class PointType : uint32_t { Disable = 0,// no point information e.g. when BuildT != Point
-                                  PointID = 1,// linear index of type uint32_t to points
-                                  World64 = 2,// Vec3d in world space
-                                  World32 = 3,// Vec3f in world space
-                                  Grid64  = 4,// Vec3d in grid space
-                                  Grid32  = 5,// Vec3f in grid space
-                                  Voxel32 = 6,// Vec3f in voxel space
-                                  Voxel16 = 7,// Vec3u16 in voxel space
-                                  Voxel8  = 8,// Vec3u8 in voxel space
-                                  Default = 9,// output matches input, i.e. Vec3d or Vec3f in world space
-                                  End     =10 };
-
-//================================================================================================
-
-/// @brief Example class of a fancy pointer that can optionally be used as a template for writing
-///        a custom fancy pointer that allows for particle coordinates to be arranged non-linearly
-///        in memory. For instance, when coordinates are interlaced with other data, i.e. in an array
-///        of structs, a custom implementation of fancy_ptr::operator[](size_t i) can account for
-///        strides that skip the other interlaced data.
-/// @tparam T Template type that specifies the type used for the coordinates of the points
-template <typename T>
-class fancy_ptr
-{
-    const T* mPtr;
-public:
-    /// @brief Default constructor.
-    /// @note This method is actually not required by CudaPointsToGrid
-    /// @param ptr Pointer to array of elements
-    __hostdev__ explicit fancy_ptr(const T* ptr = nullptr) : mPtr(ptr) {}
-    /// @brief Index access into the array pointed to by the stored pointer.
-    /// @note This method is required by CudaPointsToGrid!
-    /// @param i Unsigned index of the element to be returned
-    /// @return Const reference to the element at the i'th position
-    __hostdev__ inline const T& operator[](size_t i) const {return mPtr[i];}
-    /// @brief Dummy implementation required by pointer_traits.
-    /// @note Note that only the return type matters!
-    /// @details Unlike operator[], it is safe to assume that all pointer types have operator*,
-    ///          which is why pointer_traits makes use of it to determine the element_type that
-    ///          a pointer class is pointing to. E.g. operator[] is not always defined for std::shared_ptr!
- __hostdev__ inline const T& operator*() const {return *mPtr;} -};// fancy_ptr - -/// @brief Simple stand-alone function that can be used to conveniently construct a fancy_ptr -/// @tparam T Template type that specifies the type use for the coordinates of the points -/// @param ptr Raw pointer to data -/// @return a new instance of a fancy_ptr -template -fancy_ptr make_fancy(const T* ptr = nullptr) {return fancy_ptr(ptr);} - -/// @brief Trait of points, like type of pointer and size of the pointer type -template -struct pointer_traits; - -template -struct pointer_traits { - using element_type = T; - static constexpr size_t element_size = sizeof(T); -}; - -template -struct pointer_traits { - using element_type = typename remove_reference())>::type;// assumes T::operator*() exists! - static constexpr size_t element_size = sizeof(element_type); -}; - -//================================================================================================ - -/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is -/// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. -/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. -/// @tparam BufferT Template type of buffer used for memory allocation on the device -/// @tparam AllocT Template type of optional device allocator for internal temporary memory -/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device -/// @param pointCount number of point in the list @c d_world -/// @param voxelSize Size of a voxel in world units used for the output grid -/// @param type Defined the way point information is represented in the output grid (see PointType enum above) -/// Should not be PointType::Disable! -/// @param buffer Instance of the device buffer used for memory allocation -/// @param stream optional CUDA stream (defaults to CUDA stream 0) -/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, -/// are represented as blind data defined by @c type. -template -GridHandle -cudaPointsToGrid(const PtrT dWorldPoints, - int pointCount, - double voxelSize = 1.0, - PointType type = PointType::Default, - const BufferT &buffer = BufferT(), - cudaStream_t stream = 0); - -//================================================================================================ - -template -GridHandle -cudaPointsToGrid(std::vector> pointSet, - const BufferT &buffer = BufferT(), - cudaStream_t stream = 0); - -//================================================================================================ - -/// @brief Generates a NanoGrid of any type from a list of voxel coordinates on the device. Unlike @c cudaPointsToGrid -/// this method only builds the grid but does not encode the coordinates as blind data. It is mainly useful as a -/// means to generate a grid that is know to contain the voxels given in the list. -/// @tparam BuildT Template type of the return grid -/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. 
-/// @tparam BufferT Template type of buffer used for memory allocation on the device -/// @tparam AllocT Template type of optional device allocator for internal temporary memory -/// @param dGridVoxels Raw or fancy pointer to list of voxel coordinates in grid (or index) space on the device -/// @param pointCount number of voxel in the list @c dGridVoxels -/// @param voxelSize Size of a voxel in world units used for the output grid -/// @param buffer Instance of the device buffer used for memory allocation -/// @return Returns a handle with the grid of type NanoGrid -template -GridHandle -cudaVoxelsToGrid(const PtrT dGridVoxels, - size_t voxelCount, - double voxelSize = 1.0, - const BufferT &buffer = BufferT(), - cudaStream_t stream = 0); - -//================================================================================================ - -template -GridHandle -cudaVoxelsToGrid(std::vector> pointSet, - const BufferT &buffer = BufferT(), - cudaStream_t stream = 0); - -//================================================================================================ - -template -__hostdev__ inline static void worldToVoxel(Vec3u8 &voxel, const Vec3T &world, const Map &map) -{ - const Vec3d ijk = map.applyInverseMap(world);// world -> index - static constexpr double encode = double((1<<8) - 1); - voxel[0] = uint8_t( encode*(ijk[0] - Floor(ijk[0] + 0.5) + 0.5) ); - voxel[1] = uint8_t( encode*(ijk[1] - Floor(ijk[1] + 0.5) + 0.5) ); - voxel[2] = uint8_t( encode*(ijk[2] - Floor(ijk[2] + 0.5) + 0.5) ); -} - -template -__hostdev__ inline static void worldToVoxel(Vec3u16 &voxel, const Vec3T &world, const Map &map) -{ - const Vec3d ijk = map.applyInverseMap(world);// world -> index - static constexpr double encode = double((1<<16) - 1); - voxel[0] = uint16_t( encode*(ijk[0] - Floor(ijk[0] + 0.5) + 0.5) ); - voxel[1] = uint16_t( encode*(ijk[1] - Floor(ijk[1] + 0.5) + 0.5) ); - voxel[2] = uint16_t( encode*(ijk[2] - Floor(ijk[2] + 0.5) + 0.5) ); -} - -template -__hostdev__ inline static void worldToVoxel(Vec3f &voxel, const Vec3T &world, const Map &map) -{ - const Vec3d ijk = map.applyInverseMap(world);// world -> index - voxel[0] = float( ijk[0] - Floor(ijk[0] + 0.5) ); - voxel[1] = float( ijk[1] - Floor(ijk[1] + 0.5) ); - voxel[2] = float( ijk[2] - Floor(ijk[2] + 0.5) ); -} - -//================================================================================================ - -template -__hostdev__ inline static Vec3T voxelToWorld(const Vec3u8 &voxel, const Coord &ijk, const Map &map) -{ - static constexpr double decode = 1.0/double((1<<8) - 1); - if constexpr(is_same::value) { - return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); - } else { - return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); - } -} - -template -__hostdev__ inline static Vec3T voxelToWorld(const Vec3u16 &voxel, const Coord &ijk, const Map &map) -{ - static constexpr double decode = 1.0/double((1<<16) - 1); - if constexpr(is_same::value) { - return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); - } else { - return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); - } -} - -template -__hostdev__ inline static Vec3T voxelToWorld(const Vec3f &voxel, const Coord &ijk, const Map &map) -{ - if constexpr(is_same::value) { - return map.applyMap( 
Vec3d(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); - } else { - return map.applyMapF(Vec3f(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); - } -} - -//================================================================================================ - -namespace {// anonymous namespace - -template -class CudaPointsToGrid -{ -public: - - struct Data { - Map map; - void *d_bufferPtr; - uint64_t *d_keys, *d_tile_keys, *d_lower_keys, *d_leaf_keys;// device pointer to 64 bit keys - uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size;// byte offsets to nodes in buffer - uint32_t *d_indx;// device pointer to point indices (or IDs) - uint32_t nodeCount[3], *pointsPerLeafPrefix, *pointsPerLeaf;// 0=leaf,1=lower, 2=upper - uint32_t voxelCount, *pointsPerVoxelPrefix, *pointsPerVoxel; - BitFlags<16> flags; - __hostdev__ NanoGrid& getGrid() const {return *PtrAdd>(d_bufferPtr, grid);} - __hostdev__ NanoTree& getTree() const {return *PtrAdd>(d_bufferPtr, tree);} - __hostdev__ NanoRoot& getRoot() const {return *PtrAdd>(d_bufferPtr, root);} - __hostdev__ NanoUpper& getUpper(int i) const {return *(PtrAdd>(d_bufferPtr, upper)+i);} - __hostdev__ NanoLower& getLower(int i) const {return *(PtrAdd>(d_bufferPtr, lower)+i);} - __hostdev__ NanoLeaf& getLeaf(int i) const {return *(PtrAdd>(d_bufferPtr, leaf)+i);} - __hostdev__ GridBlindMetaData& getMeta() const { return *PtrAdd(d_bufferPtr, meta);}; - template - __hostdev__ Vec3T& getPoint(int i) const {return *(PtrAdd(d_bufferPtr, blind)+i);} - };// Data - - /// @brief Constructor from a Map - /// @param map Map to be used for the output device grid - /// @param stream optional CUDA stream (defaults to CUDA stream 0) - CudaPointsToGrid(const Map &map, cudaStream_t stream = 0) - : mStream(stream) - , mPointType(is_same::value ? 
PointType::Default : PointType::Disable) - { - mData.map = map; - mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst}); - cudaCheck(cudaMallocAsync((void**)&mDeviceData, sizeof(Data), mStream)); - } - - /// @brief Default constructor - /// @param scale Voxel size in world units - /// @param trans Translation of origin in world units - /// @param stream optional CUDA stream (defaults to CUDA stream 0) - CudaPointsToGrid(const double scale = 1.0, const Vec3d &trans = Vec3d(0.0), cudaStream_t stream = 0) - : CudaPointsToGrid(Map(scale, trans), stream) {} - - /// @brief Destructor - ~CudaPointsToGrid() {cudaCheck(cudaFreeAsync(mDeviceData, mStream));} - - /// @brief Toggle on and off verbose mode - /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking - void setVerbose(int level = 1) {mVerbose = level; mData.flags.setBit(7u, level); } - - /// @brief Set the mode for checksum computation, which is disabled by default - /// @param mode Mode of checksum computation - void setChecksum(ChecksumMode mode = ChecksumMode::Disable){mChecksum = mode;} - - /// @brief Toggle on and off the computation of a bounding-box - /// @param on If true bbox will be computed - void includeBBox(bool on = true) { mData.flags.setMask(GridFlags::HasBBox, on); } - - /// @brief Set the name of the output grid - /// @param name name of the output grid - void setGridName(const std::string &name) {mGridName = name;} - - // only available when BuildT == Point - template typename enable_if::value>::type - setPointType(PointType type) { mPointType = type; } - - /// @brief Creates a handle to a grid with the specified build type from a list of points in index or world space - /// @tparam BuildT Build type of the output grid, i.e NanoGrid - /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world or index space. - /// @tparam BufferT Buffer type used for allocation of the grid handle - /// @param points device point to an array of points in world space - /// @param pointCount number of input points or voxels - /// @param gridName optional name of the output grid - /// @param buffer optional buffer (currently ignored) - /// @return returns a handle with a grid of type NanoGrid - template - GridHandle getHandle(const PtrT points, - size_t pointCount, - const BufferT &buffer = BufferT()); - - template - void countNodes(const PtrT points, size_t pointCount); - - template - void processGridTreeRoot(const PtrT points, size_t pointCount); - - void processUpperNodes(); - - void processLowerNodes(); - - template - void processLeafNodes(const PtrT points); - - template - void processPoints(const PtrT points, size_t pointCount); - - void processBBox(); - - // the following methods are only defined when BuildT == Point - template typename enable_if::value, uint32_t>::type - maxPointsPerVoxel() const {return mMaxPointsPerVoxel;} - template typename enable_if::value, uint32_t>::type - maxPointsPerLeaf() const {return mMaxPointsPerLeaf;} - -private: - static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256! 
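// [Editor's example] A hedged host-side sketch of driving the builder declared
// above, assuming d_worldPoints already lives on the device. The helper name
// examplePointsToGrid, the voxel size and the choice of PointType::Voxel16 are
// illustrative; the entry points are those declared earlier in this header.
__host__ inline void examplePointsToGrid(const nanovdb::Vec3f *d_worldPoints, int pointCount, cudaStream_t stream = 0)
{
    using namespace nanovdb;
    // One-shot: build a Point grid that encodes the input coordinates as blind data
    auto handle = cudaPointsToGrid(d_worldPoints, pointCount, 0.5/*voxel size*/, PointType::Voxel16, CudaDeviceBuffer(), stream);
    NanoGrid<Point> *d_grid = handle.deviceGrid<Point>();
    // Or drive the class directly for finer control over name, checksum, etc.
    CudaPointsToGrid<Point> converter(0.5/*scale*/, Vec3d(0.0)/*translation*/, stream);
    converter.setGridName("points");
    converter.setPointType(PointType::Voxel16);
    converter.setChecksum(ChecksumMode::Full);
    auto handle2 = converter.getHandle(d_worldPoints, pointCount);
    (void)d_grid;// silence unused-variable warnings in this sketch
}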
- static unsigned int numBlocks(unsigned int n) {return (n + mNumThreads - 1) / mNumThreads;} - - cudaStream_t mStream{0}; - GpuTimer mTimer; - PointType mPointType; - std::string mGridName; - int mVerbose{0}; - Data mData, *mDeviceData; - uint32_t mMaxPointsPerVoxel{0u}, mMaxPointsPerLeaf{0u}; - ChecksumMode mChecksum{ChecksumMode::Disable}; - - // wrapper of cub::CachingDeviceAllocator with a shared scratch space - struct Allocator { - AllocT mAllocator; - void* d_scratch; - size_t scratchSize, actualScratchSize; - Allocator() : d_scratch(nullptr), scratchSize(0), actualScratchSize(0) {} - ~Allocator() { - if (scratchSize > 0) this->free(d_scratch);// a bug in cub makes this necessary - mAllocator.FreeAllCached(); - } - template - T* alloc(size_t count, cudaStream_t stream) { - T* d_ptr = nullptr; - cudaCheck(mAllocator.DeviceAllocate((void**)&d_ptr, sizeof(T)*count, stream)); - return d_ptr; - } - void free(void *d_ptr) {if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr));} - template - void free(void *d_ptr, T... other) { - if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr)); - this->free(other...); - } - void adjustScratch(cudaStream_t stream){ - if (scratchSize > actualScratchSize) { - if (actualScratchSize>0) cudaCheck(mAllocator.DeviceFree(d_scratch)); - cudaCheck(mAllocator.DeviceAllocate((void**)&d_scratch, scratchSize, stream)); - actualScratchSize = scratchSize; - } - } - } mMemPool; - - template - BufferT getBuffer(const PtrT points, size_t pointCount, const BufferT &buffer); -};// CudaPointsToGrid - - -namespace kernels { -/// @details Used by CudaPointsToGrid::processLeafNodes before the computation -/// of prefix-sum for index grid. -/// Moving this away from an implementation using the cudaLambdaKernel wrapper -/// to fix the following on Windows platform: -/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' -/// or 'else' block of a constexpr if statement. -/// function in a lambda through cudaLambdaKernel wrapper defined in CudaUtils.h. -template -__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename CudaPointsToGrid::Data* d_data) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= numItems) - return; - - devValueIndex[tid] = static_cast(d_data->getLeaf(tid).mValueMask.countOn()); -} - -/// @details Used by CudaPointsToGrid::processLeafNodes for the computation -/// of prefix-sum for index grid. -/// Moving this away from an implementation using the cudaLambdaKernel wrapper -/// to fix the following on Windows platform: -/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' -/// or 'else' block of a constexpr if statement. 
-template -__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename CudaPointsToGrid::Data* d_data) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= numItems) - return; - - auto &leaf = d_data->getLeaf(tid); - leaf.mOffset = 1u;// will be re-set below - const uint64_t *w = leaf.mValueMask.words(); - uint64_t &prefixSum = leaf.mPrefixSum, sum = CountOn(*w++); - prefixSum = sum; - for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 - sum += CountOn(*w++); - prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits - } - if (tid==0) { - d_data->getGrid().mData1 = 1u + devValueIndexPrefix[d_data->nodeCount[0]-1];// set total count - d_data->getTree().mVoxelCount = devValueIndexPrefix[d_data->nodeCount[0]-1]; - } else { - leaf.mOffset = 1u + devValueIndexPrefix[tid-1];// background is index 0 - } -} - -/// @details Used by CudaPointsToGrid::processLeafNodes to make sure leaf.mMask - leaf.mValueMask. -/// Moving this away from an implementation using the cudaLambdaKernel wrapper -/// to fix the following on Windows platform: -/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' -/// or 'else' block of a constexpr if statement. -template -__global__ void setMaskEqValMaskKernel(const size_t numItems, typename CudaPointsToGrid::Data* d_data) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= numItems) - return; - - auto &leaf = d_data->getLeaf(tid); - leaf.mMask = leaf.mValueMask; -} -} // namespace kernels - - -//================================================================================================ - -// Define utility macro used to call cub functions that use dynamic temporary storage -#ifndef CALL_CUBS -#ifdef _WIN32 -#define CALL_CUBS(func, ...) \ - cudaCheck(cub::func(nullptr, mMemPool.scratchSize, __VA_ARGS__, mStream)); \ - mMemPool.adjustScratch(mStream); \ - cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, __VA_ARGS__, mStream)); -#else// fdef _WIN32 -#define CALL_CUBS(func, args...) 
\ - cudaCheck(cub::func(nullptr, mMemPool.scratchSize, args, mStream)); \ - mMemPool.adjustScratch(mStream); \ - cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, args, mStream)); -#endif// ifdef _WIN32 -#endif// ifndef CALL_CUBS - -}// anonymous namespace - -//================================================================================================ - -template -template -inline GridHandle -CudaPointsToGrid::getHandle(const PtrT points, - size_t pointCount, - const BufferT &pool) -{ - if (mVerbose==1) mTimer.start("\nCounting nodes"); - this->countNodes(points, pointCount); - - if (mVerbose==1) mTimer.restart("Initiate buffer"); - auto buffer = this->getBuffer(points, pointCount, pool); - - if (mVerbose==1) mTimer.restart("Process grid,tree,root"); - this->processGridTreeRoot(points, pointCount); - - if (mVerbose==1) mTimer.restart("Process upper nodes"); - this->processUpperNodes(); - - if (mVerbose==1) mTimer.restart("Process lower nodes"); - this->processLowerNodes(); - - if (mVerbose==1) mTimer.restart("Process leaf nodes"); - this->processLeafNodes(points); - - if (mVerbose==1) mTimer.restart("Process points"); - this->processPoints(points, pointCount); - - if (mVerbose==1) mTimer.restart("Process bbox"); - this->processBBox(); - if (mVerbose==1) mTimer.stop(); - - if (mChecksum != ChecksumMode::Disable) { - if (mVerbose==1) mTimer.restart("Computation of checksum"); - cudaGridChecksum((GridData*)buffer.deviceData(), mChecksum); - if (mVerbose==1) mTimer.stop(); - } - - cudaStreamSynchronize(mStream);// finish all device tasks in mStream - - return GridHandle(std::move(buffer)); -}// CudaPointsToGrid::getHandle - -//================================================================================================ - -// --- CUB helpers --- -template -struct ShiftRight -{ - __hostdev__ inline OutT operator()(const InT& v) const {return static_cast(v >> BitCount);} -}; - -template -struct ShiftRightIterator : public cub::TransformInputIterator, InT*> -{ - using BASE = cub::TransformInputIterator, InT*>; - __hostdev__ inline ShiftRightIterator(uint64_t* input_itr) : BASE(input_itr, ShiftRight()) {} -}; - -//================================================================================================ - -template -template -void CudaPointsToGrid::countNodes(const PtrT points, size_t pointCount) -{ - using Vec3T = typename remove_const::element_type>::type; - if constexpr(is_same::value) { - static_assert(is_same::value, "Point (vs voxels) coordinates should be represented as Vec3f or Vec3d"); - } else { - static_assert(is_same::value, "Voxel coordinates should be represented as Coord, Vec3f or Vec3d"); - } - - mData.d_keys = mMemPool.template alloc(pointCount, mStream); - mData.d_indx = mMemPool.template alloc(pointCount, mStream);// uint32_t can index 4.29 billion Coords, corresponding to 48 GB - cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy mData from CPU -> GPU - - if (mVerbose==2) mTimer.start("\nAllocating arrays for keys and indices"); - auto *d_keys = mMemPool.template alloc(pointCount, mStream); - auto *d_indx = mMemPool.template alloc(pointCount, mStream); - - if (mVerbose==2) mTimer.restart("Generate tile keys"); - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, const Data *d_data, const PtrT points) { - auto coordToKey = [](const Coord &ijk)->uint64_t{ - // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 - static constexpr int64_t offset = 1 << 31; 
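// [Editor's example] Unpacking the 64-bit tile key constructed by the surrounding
// coordToKey lambda: each upper-node (tile) index occupies a 21-bit field, packed
// x|y|z from high to low bits, so the subsequent radix sort groups points by tile.
// The helper name exampleKeyToTile is illustrative, not part of the original header.
__hostdev__ inline void exampleKeyToTile(uint64_t key, uint32_t &x, uint32_t &y, uint32_t &z)
{
    static constexpr uint64_t mask = (1u << 21) - 1;// lowest 21 bits
    z = uint32_t( key        & mask);// tile index along z
    y = uint32_t((key >> 21) & mask);// tile index along y
    x = uint32_t((key >> 42) & mask);// tile index along x
}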
- return (uint64_t(uint32_t(int64_t(ijk[2]) + offset) >> 12) ) | // z is the lower 21 bits - (uint64_t(uint32_t(int64_t(ijk[1]) + offset) >> 12) << 21) | // y is the middle 21 bits - (uint64_t(uint32_t(int64_t(ijk[0]) + offset) >> 12) << 42); // x is the upper 21 bits - };// coordToKey lambda functor - d_indx[tid] = uint32_t(tid); - uint64_t &key = d_keys[tid]; - if constexpr(is_same::value) {// points are in world space - if constexpr(is_same::value) { - key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); - } else {// points are Vec3d - key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); - } - } else if constexpr(is_same::value) {// points Coord are in index space - key = coordToKey(points[tid]); - } else {// points are Vec3f or Vec3d in index space - key = coordToKey(points[tid].round()); - } - }, mDeviceData, points); - cudaCheckError(); - if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); - CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 62);// 21 bits per coord - std::swap(d_indx, mData.d_indx);// sorted indices are now in d_indx - - if (mVerbose==2) mTimer.restart("Allocate runs"); - auto *d_points_per_tile = mMemPool.template alloc(pointCount, mStream); - uint32_t *d_node_count = mMemPool.template alloc(3, mStream); - - if (mVerbose==2) mTimer.restart("DeviceRunLengthEncode tile keys"); - CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, d_points_per_tile, d_node_count+2, pointCount); - cudaCheck(cudaMemcpyAsync(mData.nodeCount+2, d_node_count+2, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); - cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); - - if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); - uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; - cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mMemPool.free(d_points_per_tile); - - for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { - const uint32_t count = points_per_tile[id]; - cudaLambdaKernel<<>>(count, [=] __device__(size_t tid, const Data *d_data) { - auto voxelKey = [] __device__ (uint64_t tileID, const Coord &ijk){ - return tileID << 36 | // upper offset: 64-15-12-9=28, i.e. last 28 bits - uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits - uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits - uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits - };// voxelKey lambda functor - tid += offset; - Vec3T p = points[d_indx[tid]]; - if constexpr(is_same::value) p = is_same::value ? 
d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); - d_keys[tid] = voxelKey(id, p.round()); - }, mDeviceData); cudaCheckError(); - CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36 - offset += count; - } - mMemPool.free(d_indx); - delete [] points_per_tile; - - if (mVerbose==2) mTimer.restart("Count points per voxel"); - - mData.pointsPerVoxel = mMemPool.template alloc(pointCount, mStream); - uint32_t *d_voxel_count = mMemPool.template alloc(1, mStream); - CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, mData.pointsPerVoxel, d_voxel_count, pointCount); - cudaCheck(cudaMemcpyAsync(&mData.voxelCount, d_voxel_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mMemPool.free(d_voxel_count); - - if constexpr(is_same::value) { - if (mVerbose==2) mTimer.restart("Count max points per voxel"); - uint32_t *d_maxPointsPerVoxel = mMemPool.template alloc(1, mStream); - CALL_CUBS(DeviceReduce::Max, mData.pointsPerVoxel, d_maxPointsPerVoxel, mData.voxelCount); - cudaCheck(cudaMemcpyAsync(&mMaxPointsPerVoxel, d_maxPointsPerVoxel, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mMemPool.free(d_maxPointsPerVoxel); - } - - //printf("\n Active voxel count = %u, max points per voxel = %u\n", mData.voxelCount, mMaxPointsPerVoxel); - if (mVerbose==2) mTimer.restart("Compute prefix sum of points per voxel"); - mData.pointsPerVoxelPrefix = mMemPool.template alloc(mData.voxelCount, mStream); - CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.voxelCount); - - mData.pointsPerLeaf = mMemPool.template alloc(pointCount, mStream); - CALL_CUBS(DeviceRunLengthEncode::Encode, ShiftRightIterator<9>(mData.d_keys), d_keys, mData.pointsPerLeaf, d_node_count, pointCount); - cudaCheck(cudaMemcpyAsync(mData.nodeCount, d_node_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - - if constexpr(is_same::value) { - uint32_t *d_maxPointsPerLeaf = mMemPool.template alloc(1, mStream); - CALL_CUBS(DeviceReduce::Max, mData.pointsPerLeaf, d_maxPointsPerLeaf, mData.nodeCount[0]); - cudaCheck(cudaMemcpyAsync(&mMaxPointsPerLeaf, d_maxPointsPerLeaf, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - //printf("\n Leaf count = %u, max points per leaf = %u\n", mData.nodeCount[0], mMaxPointsPerLeaf); - if (mMaxPointsPerLeaf > std::numeric_limits::max()) { - throw std::runtime_error("Too many points per leaf: "+std::to_string(mMaxPointsPerLeaf)); - } - mMemPool.free(d_maxPointsPerLeaf); - } - - mData.pointsPerLeafPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); - CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerLeaf, mData.pointsPerLeafPrefix, mData.nodeCount[0]); - - mData.d_leaf_keys = mMemPool.template alloc(mData.nodeCount[0], mStream); - cudaCheck(cudaMemcpyAsync(mData.d_leaf_keys, d_keys, mData.nodeCount[0]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); - - CALL_CUBS(DeviceSelect::Unique, ShiftRightIterator<12>(mData.d_leaf_keys), d_keys, d_node_count+1, mData.nodeCount[0]);// count lower nodes - cudaCheck(cudaMemcpyAsync(mData.nodeCount+1, d_node_count+1, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mData.d_lower_keys = mMemPool.template alloc(mData.nodeCount[1], mStream); - cudaCheck(cudaMemcpyAsync(mData.d_lower_keys, d_keys, mData.nodeCount[1]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); - - mMemPool.free(d_keys, d_node_count); - if (mVerbose==2) mTimer.stop(); - - //printf("Leaf count = %u, lower 
count = %u, upper count = %u\n", mData.nodeCount[0], mData.nodeCount[1], mData.nodeCount[2]); -}// CudaPointsToGrid::countNodes - -//================================================================================================ - -template -template -inline BufferT CudaPointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) -{ - auto sizeofPoint = [&]()->size_t{ - switch (mPointType){ - case PointType::PointID: return sizeof(uint32_t); - case PointType::World64: return sizeof(Vec3d); - case PointType::World32: return sizeof(Vec3f); - case PointType::Grid64: return sizeof(Vec3d); - case PointType::Grid32: return sizeof(Vec3f); - case PointType::Voxel32: return sizeof(Vec3f); - case PointType::Voxel16: return sizeof(Vec3u16); - case PointType::Voxel8: return sizeof(Vec3u8); - case PointType::Default: return pointer_traits::element_size; - default: return size_t(0);// PointType::Disable - } - }; - - mData.grid = 0;// grid is always stored at the start of the buffer! - mData.tree = NanoGrid::memUsage(); // grid ends and tree begins - mData.root = mData.tree + NanoTree::memUsage(); // tree ends and root node begins - mData.upper = mData.root + NanoRoot::memUsage(mData.nodeCount[2]); // root node ends and upper internal nodes begin - mData.lower = mData.upper + NanoUpper::memUsage()*mData.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin - mData.leaf = mData.lower + NanoLower::memUsage()*mData.nodeCount[1]; // lower internal nodes ends and leaf nodes begin - mData.meta = mData.leaf + NanoLeaf::DataType::memUsage()*mData.nodeCount[0];// leaf nodes end and blind meta data begins - mData.blind = mData.meta + sizeof(GridBlindMetaData)*int( mPointType!=PointType::Disable ); // meta data ends and blind data begins - mData.size = mData.blind + pointCount*sizeofPoint();// end of buffer - - auto buffer = BufferT::create(mData.size, &pool, false);// only allocate buffer on the device - mData.d_bufferPtr = buffer.deviceData(); - if (mData.d_bufferPtr == nullptr) throw std::runtime_error("Failed to allocate grid buffer on the device"); - cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy Data CPU -> GPU - return buffer; -}// CudaPointsToGrid::getBuffer - -//================================================================================================ - -template -template -inline void CudaPointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) -{ - using Vec3T = typename remove_const::element_type>::type; - cudaLambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { - // process Root - auto &root = d_data->getRoot(); - root.mBBox = CoordBBox(); // init to empty - root.mTableSize = d_data->nodeCount[2]; - root.mBackground = NanoRoot::ValueType(0);// background_value - root.mMinimum = root.mMaximum = NanoRoot::ValueType(0); - root.mAverage = root.mStdDevi = NanoRoot::FloatType(0); - - // process Tree - auto &tree = d_data->getTree(); - tree.setRoot(&root); - tree.setFirstNode(&d_data->getUpper(0)); - tree.setFirstNode(&d_data->getLower(0)); - tree.setFirstNode(&d_data->getLeaf(0)); - tree.mNodeCount[2] = tree.mTileCount[2] = d_data->nodeCount[2]; - tree.mNodeCount[1] = tree.mTileCount[1] = d_data->nodeCount[1]; - tree.mNodeCount[0] = tree.mTileCount[0] = d_data->nodeCount[0]; - tree.mVoxelCount = d_data->voxelCount; - - // process Grid - auto &grid = d_data->getGrid(); - grid.init({GridFlags::HasBBox, GridFlags::IsBreadthFirst}, d_data->size, 
d_data->map, mapToGridType()); - grid.mChecksum = ~uint64_t(0);// set all bits on which means it's disabled - grid.mBlindMetadataCount = is_same::value;// ? 1u : 0u; - grid.mBlindMetadataOffset = d_data->meta; - if (pointType != PointType::Disable) { - const auto lastLeaf = tree.mNodeCount[0] - 1; - grid.mData1 = d_data->pointsPerLeafPrefix[lastLeaf] + d_data->pointsPerLeaf[lastLeaf]; - auto &meta = d_data->getMeta(); - meta.mDataOffset = sizeof(GridBlindMetaData);// blind data is placed right after this meta data - meta.mValueCount = pointCount; - // Blind meta data - switch (pointType){ - case PointType::PointID: - grid.mGridClass = GridClass::PointIndex; - meta.mSemantic = GridBlindDataSemantic::PointId; - meta.mDataClass = GridBlindDataClass::IndexArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(uint32_t); - cudaStrcpy(meta.mName, "PointID: uint32_t indices to points"); - break; - case PointType::World64: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::WorldCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3d); - cudaStrcpy(meta.mName, "World64: Vec3 point coordinates in world space"); - break; - case PointType::World32: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::WorldCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3f); - cudaStrcpy(meta.mName, "World32: Vec3 point coordinates in world space"); - break; - case PointType::Grid64: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::GridCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3d); - cudaStrcpy(meta.mName, "Grid64: Vec3 point coordinates in grid space"); - break; - case PointType::Grid32: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::GridCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3f); - cudaStrcpy(meta.mName, "Grid32: Vec3 point coordinates in grid space"); - break; - case PointType::Voxel32: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::VoxelCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3f); - cudaStrcpy(meta.mName, "Voxel32: Vec3 point coordinates in voxel space"); - break; - case PointType::Voxel16: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::VoxelCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3u16); - cudaStrcpy(meta.mName, "Voxel16: Vec3 point coordinates in voxel space"); - break; - case PointType::Voxel8: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::VoxelCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3u8); - cudaStrcpy(meta.mName, "Voxel8: Vec3 point coordinates in voxel space"); - break; - case PointType::Default: - grid.mGridClass = GridClass::PointData; - meta.mSemantic = GridBlindDataSemantic::WorldCoords; - meta.mDataClass = GridBlindDataClass::AttributeArray; - meta.mDataType = mapToGridType(); - meta.mValueSize = sizeof(Vec3T); - if constexpr(is_same::value) { - 
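// [annotation — not part of the patch] PointType::Default stores the input points
// verbatim, so Vec3T is whatever element type pointer_traits deduced from PtrT;
// only Vec3f and Vec3d inputs are expected here, hence the two branches below
// (with a device-side error message for anything else).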
cudaStrcpy(meta.mName, "World32: Vec3 point coordinates in world space"); - } else if constexpr(is_same::value){ - cudaStrcpy(meta.mName, "World64: Vec3 point coordinates in world space"); - } else { - printf("Error in CudaPointsToGrid::processGridTreeRoot: expected Vec3T = Vec3f or Vec3d\n"); - } - break; - default: - printf("Error in CudaPointsToGrid::processGridTreeRoot: invalid pointType\n"); - } - } else if constexpr(BuildTraits::is_offindex) { - grid.mData1 = 1u + 512u*d_data->nodeCount[0]; - grid.mGridClass = GridClass::IndexGrid; - } - }, mDeviceData, mPointType);// cudaLambdaKernel - cudaCheckError(); - - char *dst = mData.getGrid().mGridName; - if (const char *src = mGridName.data()) { - cudaCheck(cudaMemcpyAsync(dst, src, GridData::MaxNameSize, cudaMemcpyHostToDevice, mStream)); - } else { - cudaCheck(cudaMemsetAsync(dst, 0, GridData::MaxNameSize, mStream)); - } -}// CudaPointsToGrid::processGridTreeRoot - -//================================================================================================ - -template -inline void CudaPointsToGrid::processUpperNodes() -{ - cudaLambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { - auto &root = d_data->getRoot(); - auto &upper = d_data->getUpper(tid); -#if 1 - auto keyToCoord = [](uint64_t key)->nanovdb::Coord{ - static constexpr int64_t offset = 1 << 31;// max values of uint32_t is 2^31 - 1 - static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits - return nanovdb::Coord(int(int64_t(((key >> 42) & MASK) << 12) - offset), // x are the upper 21 bits - int(int64_t(((key >> 21) & MASK) << 12) - offset), // y are the middle 21 bits - int(int64_t(( key & MASK) << 12) - offset)); // z are the lower 21 bits - }; - const Coord ijk = keyToCoord(d_data->d_tile_keys[tid]); -#else - const Coord ijk = NanoRoot::KeyToCoord(d_data->d_tile_keys[tid]); -#endif - root.tile(tid)->setChild(ijk, &upper, &root); - upper.mBBox[0] = ijk; - upper.mFlags = 0; - upper.mValueMask.setOff(); - upper.mChildMask.setOff(); - upper.mMinimum = upper.mMaximum = NanoLower::ValueType(0); - upper.mAverage = upper.mStdDevi = NanoLower::FloatType(0); - }, mDeviceData); - cudaCheckError(); - - mMemPool.free(mData.d_tile_keys); - - const uint64_t valueCount = mData.nodeCount[2] << 15; - cudaLambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { - auto &upper = d_data->getUpper(tid >> 15); - upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background - }, mDeviceData); - cudaCheckError(); -}// CudaPointsToGrid::processUpperNodes - -//================================================================================================ - -template -inline void CudaPointsToGrid::processLowerNodes() -{ - cudaLambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { - auto &root = d_data->getRoot(); - const uint64_t lowerKey = d_data->d_lower_keys[tid]; - auto &upper = d_data->getUpper(lowerKey >> 15); - const uint32_t upperOffset = lowerKey & 32767u;// (1 << 15) - 1 = 32767 - upper.mChildMask.setOnAtomic(upperOffset); - auto &lower = d_data->getLower(tid); - upper.setChild(upperOffset, &lower); - lower.mBBox[0] = upper.offsetToGlobalCoord(upperOffset); - lower.mFlags = 0; - lower.mValueMask.setOff(); - lower.mChildMask.setOff(); - lower.mMinimum = lower.mMaximum = NanoLower::ValueType(0);// background; - lower.mAverage = lower.mStdDevi = NanoLower::FloatType(0); - }, mDeviceData); - cudaCheckError(); - - const uint64_t valueCount = mData.nodeCount[1] << 12; - 
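// [annotation — not part of the patch] Each lower node owns a 16^3 = 4096 entry
// table, hence the << 12 above. In the kernel below, tid >> 12 selects the node
// and tid & 4095 the table entry; e.g. with nodeCount[1] = 3 the launch covers
// valueCount = 12288 threads, and tid = 5000 initializes entry 904 of node 1.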
cudaLambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { - auto &lower = d_data->getLower(tid >> 12); - lower.mTable[tid & 4095u].value = NanoLower::ValueType(0);// background - }, mDeviceData); - cudaCheckError(); -}// CudaPointsToGrid::processLowerNodes - -//================================================================================================ - -template -template -inline void CudaPointsToGrid::processLeafNodes(const PtrT points) -{ - const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats - - if (mVerbose==2) mTimer.start("process leaf meta data"); - // loop over leaf nodes and add it to its parent node - cudaLambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { - const uint64_t leafKey = d_data->d_leaf_keys[tid], tile_id = leafKey >> 27; - auto &upper = d_data->getUpper(tile_id); - const uint32_t lowerOffset = leafKey & 4095u, upperOffset = (leafKey >> 12) & 32767u; - auto &lower = *upper.getChild(upperOffset); - lower.mChildMask.setOnAtomic(lowerOffset); - auto &leaf = d_data->getLeaf(tid); - lower.setChild(lowerOffset, &leaf); - leaf.mBBoxMin = lower.offsetToGlobalCoord(lowerOffset); - leaf.mFlags = flags; - auto &valueMask = leaf.mValueMask; - valueMask.setOff();// initiate all bits to off - - if constexpr(is_same::value) { - leaf.mOffset = d_data->pointsPerLeafPrefix[tid]; - leaf.mPointCount = d_data->pointsPerLeaf[tid]; - } else if constexpr(BuildTraits::is_offindex) { - leaf.mOffset = tid*512u + 1u;// background is index 0 - leaf.mPrefixSum = 0u; - } else if constexpr(!BuildTraits::is_special) { - leaf.mAverage = leaf.mStdDevi = NanoLeaf::FloatType(0); - leaf.mMinimum = leaf.mMaximum = NanoLeaf::ValueType(0); - } - }, mDeviceData); cudaCheckError(); - - if (mVerbose==2) mTimer.restart("set active voxel state and values"); - // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues - cudaLambdaKernel<<>>(mData.voxelCount, [=] __device__(size_t tid, Data *d_data) { - const uint32_t pointID = d_data->pointsPerVoxelPrefix[tid]; - const uint64_t voxelKey = d_data->d_keys[pointID]; - auto &upper = d_data->getUpper(voxelKey >> 36); - auto &lower = *upper.getChild((voxelKey >> 21) & 32767u); - auto &leaf = *lower.getChild((voxelKey >> 9) & 4095u); - const uint32_t n = voxelKey & 511u; - leaf.mValueMask.setOnAtomic(n);// <--- slow! - if constexpr(is_same::value) { - leaf.mValues[n] = uint16_t(pointID + d_data->pointsPerVoxel[tid] - leaf.offset()); - } else if constexpr(!BuildTraits::is_special) { - leaf.mValues[n] = NanoLeaf::ValueType(1);// set value of active voxels that are not points (or index) - } - }, mDeviceData); cudaCheckError(); - - mMemPool.free(mData.d_keys, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.pointsPerLeafPrefix, mData.pointsPerLeaf); - - if (mVerbose==2) mTimer.restart("set inactive voxel values"); - const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; - cudaLambdaKernel<<>>(denseVoxelCount, [=] __device__(size_t tid, Data *d_data) { - auto &leaf = d_data->getLeaf(tid >> 9u); - const uint32_t n = tid & 511u; - if (leaf.mValueMask.isOn(n)) return; - if constexpr(is_same::value) { - const uint32_t m = leaf.mValueMask.findPrev(n - 1); - leaf.mValues[n] = m < 512u ? 
leaf.mValues[m] : 0u; - } else if constexpr(!BuildTraits::is_special) { - leaf.mValues[n] = NanoLeaf::ValueType(0);// value of inactive voxels - } - }, mDeviceData); cudaCheckError(); - - if constexpr(BuildTraits::is_onindex) { - if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); - uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); - auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); - kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); - cudaCheckError(); - CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); - mMemPool.free(devValueIndex); - kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); - cudaCheckError(); - mMemPool.free(devValueIndexPrefix); - } - - if constexpr(BuildTraits::is_indexmask) { - if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); - kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); - cudaCheckError(); - } - if (mVerbose==2) mTimer.stop(); -}// CudaPointsToGrid::processLeafNodes - -//================================================================================================ - -template -template -inline void CudaPointsToGrid::processPoints(const PtrT, size_t) -{ - mMemPool.free(mData.d_indx, mStream); -} - -//================================================================================================ - -// Template specialization with BuildT = Point -template <> -template -inline void CudaPointsToGrid::processPoints(const PtrT points, size_t pointCount) -{ - switch (mPointType){ - case PointType::Disable: - throw std::runtime_error("CudaPointsToGrid::processPoints: mPointType == PointType::Disable\n"); - case PointType::PointID: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint(tid) = d_data->d_indx[tid]; - }, mDeviceData); cudaCheckError(); - break; - case PointType::World64: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; - }, mDeviceData); cudaCheckError(); - break; - case PointType::World32: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; - }, mDeviceData); cudaCheckError(); - break; - case PointType::Grid64: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint(tid) = d_data->map.applyInverseMap(points[d_data->d_indx[tid]]); - }, mDeviceData); cudaCheckError(); - break; - case PointType::Grid32: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint(tid) = d_data->map.applyInverseMapF(points[d_data->d_indx[tid]]); - }, mDeviceData); cudaCheckError(); - break; - case PointType::Voxel32: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); - }, mDeviceData); cudaCheckError(); - break; - case PointType::Voxel16: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); - }, mDeviceData); cudaCheckError(); - break; - case PointType::Voxel8: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], 
d_data->map); - }, mDeviceData); cudaCheckError(); - break; - case PointType::Default: - cudaLambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { - d_data->template getPoint::element_type>(tid) = points[d_data->d_indx[tid]]; - }, mDeviceData); cudaCheckError(); - break; - default: - printf("Internal error in CudaPointsToGrid::processPoints\n"); - } - mMemPool.free(mData.d_indx); -}// CudaPointsToGrid::processPoints - -//================================================================================================ - -template -inline void CudaPointsToGrid::processBBox() -{ - if (mData.flags.isMaskOff(GridFlags::HasBBox)) { - mMemPool.free(mData.d_leaf_keys, mData.d_lower_keys); - return; - } - - // reset bbox in lower nodes - cudaLambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { - d_data->getLower(tid).mBBox = CoordBBox(); - }, mDeviceData); - cudaCheckError(); - - // update and propagate bbox from leaf -> lower/parent nodes - cudaLambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { - const uint64_t leafKey = d_data->d_leaf_keys[tid]; - auto &upper = d_data->getUpper(leafKey >> 27); - auto &lower = *upper.getChild((leafKey >> 12) & 32767u); - auto &leaf = d_data->getLeaf(tid); - leaf.updateBBox(); - lower.mBBox.expandAtomic(leaf.bbox()); - }, mDeviceData); - mMemPool.free(mData.d_leaf_keys); - cudaCheckError(); - - // reset bbox in upper nodes - cudaLambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { - d_data->getUpper(tid).mBBox = CoordBBox(); - }, mDeviceData); - cudaCheckError(); - - // propagate bbox from lower -> upper/parent node - cudaLambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { - const uint64_t lowerKey = d_data->d_lower_keys[tid]; - auto &upper = d_data->getUpper(lowerKey >> 15); - auto &lower = d_data->getLower(tid); - upper.mBBox.expandAtomic(lower.bbox()); - }, mDeviceData); - mMemPool.free(mData.d_lower_keys); - cudaCheckError() - - // propagate bbox from upper -> root/parent node - cudaLambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { - d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); - }, mDeviceData); - cudaCheckError(); - - // update the world-bbox in the root node - cudaLambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data) { - d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); - }, mDeviceData); - cudaCheckError(); -}// CudaPointsToGrid::processBBox - -//================================================================================================ - -template -GridHandle// Grid with PointType coordinates as blind data -cudaPointsToGrid(const PtrT d_xyz, int pointCount, double voxelSize, PointType type, const BufferT &buffer, cudaStream_t stream) -{ - CudaPointsToGrid converter(voxelSize, Vec3d(0.0), stream); - converter.setPointType(type); - return converter.getHandle(d_xyz, pointCount, buffer); -} - -//================================================================================================ - -template -GridHandle// Grid -cudaVoxelsToGrid(const PtrT d_ijk, size_t voxelCount, double voxelSize, const BufferT &buffer, cudaStream_t stream) -{ - CudaPointsToGrid converter(voxelSize, Vec3d(0.0), stream); - return converter.getHandle(d_ijk, voxelCount, buffer); -} - -//================================================================================================ - -template -GridHandle -cudaPointsToGrid(std::vector> vec, const 
BufferT &buffer, cudaStream_t stream) -{ - std::vector> handles; - for (auto &p : vec) handles.push_back(cudaPointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); - return mergeDeviceGrids(handles, stream); -} - -//================================================================================================ - -template -GridHandle -cudaVoxelsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) -{ - std::vector> handles; - for (auto &p : vec) handles.push_back(cudaVoxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); - return mergeDeviceGrids(handles, stream); -} - -}// nanovdb namespace - -#endif // NVIDIA_CUDA_POINTS_TO_GRID_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/PointsToGrid.cuh instead.") diff --git a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh index 2f4bf203d6..f9ba99b8fc 100644 --- a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh @@ -1,201 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/*! - \file CudaSignedFloodFill.cuh - - \author Ken Museth - - \date May 3, 2023 - - \brief Performs signed flood-fill operation on the hierarchical tree structure on the device - - \todo This tools needs to handle the (extremely) rare case when root node - needs to be modified during the signed flood fill operation. This happens - when the root-table needs to be expanded with tile values (of size 4096^3) - that are completely inside the implicit surface. - - \warning The header file contains cuda device code so be sure - to only include it in .cu files (or other .cuh files) -*/ - -#ifndef NANOVDB_CUDA_SIGNED_FLOOD_FILL_CUH_HAS_BEEN_INCLUDED -#define NANOVDB_CUDA_SIGNED_FLOOD_FILL_CUH_HAS_BEEN_INCLUDED - -#include -#include -#include -#include -#include - -namespace nanovdb { - -/// @brief Performs signed flood-fill operation on the hierarchical tree structure on the device -/// @tparam BuildT Build type of the grid to be flood-filled -/// @param d_grid Non-const device pointer to the grid that will be flood-filled -/// @param verbose If true timing information will be printed to the terminal -/// @param stream optional cuda stream -template -typename enable_if::is_float, void>::type -cudaSignedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0); - -namespace {// anonymous namespace - -template -class CudaSignedFloodFill -{ -public: - CudaSignedFloodFill(bool verbose = false, cudaStream_t stream = 0) - : mStream(stream), mVerbose(verbose) {} - - /// @brief Toggle on and off verbose mode - /// @param on if true verbose is turned on - void setVerbose(bool on = true) {mVerbose = on;} - - void operator()(NanoGrid *d_grid); - -private: - cudaStream_t mStream{0}; - GpuTimer mTimer; - bool mVerbose{false}; - -};// CudaSignedFloodFill - -//================================================================================================ - -template -__global__ void cudaProcessRootNode(NanoTree *tree) -{ - // auto &root = tree->root(); - /* - using ChildT = typename RootT::ChildNodeType; - // Insert the child nodes into a map sorted according to their origin - std::map nodeKeys; - typename RootT::ChildOnIter it = root.beginChildOn(); - for (; it; ++it) nodeKeys.insert(std::pair(it.getCoord(), &(*it))); - static const Index DIM = RootT::ChildNodeType::DIM; - 
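[annotation — not part of the patch] In the commented-out sketch below, "sandwiched"
means: two children follow each other in the origin-sorted map, share the same x and y,
and both facing boundary values (last value of the lower child, first value of the
upper child) are negative, i.e. inside; every 4096^3 slot between them along z can
then safely be inserted as an inactive inside-tile.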
- // We employ a simple z-scanline algorithm that inserts inactive tiles with - // the inside value if they are sandwiched between inside child nodes only! - typename std::map::const_iterator b = nodeKeys.begin(), e = nodeKeys.end(); - if ( b == e ) return; - for (typename std::map::const_iterator a = b++; b != e; ++a, ++b) { - Coord d = b->first - a->first; // delta of neighboring coordinates - if (d[0]!=0 || d[1]!=0 || d[2]==Int32(DIM)) continue;// not same z-scanline or neighbors - const ValueT fill[] = { a->second->getLastValue(), b->second->getFirstValue() }; - if (!(fill[0] < 0) || !(fill[1] < 0)) continue; // scanline isn't inside - Coord c = a->first + Coord(0u, 0u, DIM); - for (; c[2] != b->first[2]; c[2] += DIM) root.addTile(c, mInside, false); - } - */ - //root.setBackground(mOutside, /*updateChildNodes=*/false); -}// cudaProcessRootNode - -//================================================================================================ - -template -__global__ void cudaProcessInternalNodes(NanoTree *tree, size_t count) -{ - using NodeT = typename NanoNode::type; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= count) return; - const uint32_t nValue = tid & (NodeT::SIZE - 1u); - auto &node = *(tree->template getFirstNode() + (tid >> (3*NodeT::LOG2DIM))); - const auto &mask = node.childMask(); - if (mask.isOn(nValue)) return;// ignore if child - auto value = tree->background();// initiate to outside value - auto n = mask.template findNext(nValue); - if (n < NodeT::SIZE) { - if (node.getChild(n)->getFirstValue() < 0) value = -value; - } else if ((n = mask.template findPrev(nValue)) < NodeT::SIZE) { - if (node.getChild(n)->getLastValue() < 0) value = -value; - } else if (node.getValue(0)<0) { - value = -value; - } - node.setValue(nValue, value); -}// cudaProcessInternalNodes - -//================================================================================================ - -template -__global__ void cudaProcessLeafNodes(NanoTree *tree, size_t count) -{ - using LeafT = NanoLeaf; - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= count) return; - const uint32_t nVoxel = tid & (LeafT::SIZE - 1u); - auto *leaf = tree->getFirstLeaf() + (tid >> (3*LeafT::LOG2DIM)); - const auto &mask = leaf->valueMask(); - if (mask.isOn(nVoxel)) return; - auto *buffer = leaf->mValues; - auto n = mask.template findNext(nVoxel); - if (n == LeafT::SIZE && (n = mask.template findPrev(nVoxel)) == LeafT::SIZE) n = 0u; - buffer[nVoxel] = buffer[n]<0 ? 
-tree->background() : tree->background(); -}// cudaProcessLeafNodes - -//================================================================================================ - -template -__global__ void cudaCpyNodeCount(NanoGrid *d_grid, uint64_t *d_count) -{ - NANOVDB_ASSERT(d_grid->isSequential()); - for (int i=0; i<3; ++i) *d_count++ = d_grid->tree().nodeCount(i); - *d_count = d_grid->tree().root().tileCount(); -} - -}// anonymous namespace - -//================================================================================================ - -template -void CudaSignedFloodFill::operator()(NanoGrid *d_grid) -{ - static_assert(BuildTraits::is_float, "CudaSignedFloodFill only works on float grids"); - NANOVDB_ASSERT(d_grid); - uint64_t count[4], *d_count = nullptr; - cudaCheck(cudaMallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream)); - cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_grid, d_count); - cudaCheckError(); - cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream)); - cudaCheck(cudaFreeAsync(d_count, mStream)); - - static const int threadsPerBlock = 128; - auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; - auto *tree = reinterpret_cast*>(d_grid + 1); - - if (mVerbose) mTimer.start("\nProcess leaf nodes"); - cudaProcessLeafNodes<<>>(tree, count[0]<<9); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Process lower internal nodes"); - cudaProcessInternalNodes<<>>(tree, count[1]<<12); - cudaCheckError(); - - if (mVerbose) mTimer.restart("Process upper internal nodes"); - cudaProcessInternalNodes<<>>(tree, count[2]<<15); - cudaCheckError(); - - //if (mVerbose) mTimer.restart("Process root node"); - //cudaProcessRootNode<<<1, 1, 0, mStream>>>(tree); - if (mVerbose) mTimer.stop(); - cudaCheckError(); -}// CudaSignedFloodFill::operator() - -//================================================================================================ - -template -typename enable_if::is_float, void>::type -cudaSignedFloodFill(NanoGrid *d_grid, bool verbose, cudaStream_t stream) -{ - CudaSignedFloodFill sff(verbose, stream); - sff(d_grid); - auto *d_gridData = d_grid->data(); - GridChecksum cs = cudaGetGridChecksum(d_gridData, stream); - if (cs.mode() == ChecksumMode::Full) {// ChecksumMode::Partial checksum is unaffected - cudaGridChecksum(d_gridData, ChecksumMode::Full, stream); - } -} - -}// nanovdb namespace - -#endif // NANOVDB_CUDA_SIGNED_FLOOD_FILL_CUH_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/SignedFloodFill.cuh instead.") diff --git a/nanovdb/nanovdb/util/cuda/CudaUtils.h b/nanovdb/nanovdb/util/cuda/CudaUtils.h index 40001748ee..e154ff9b4e 100644 --- a/nanovdb/nanovdb/util/cuda/CudaUtils.h +++ b/nanovdb/nanovdb/util/cuda/CudaUtils.h @@ -1,136 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -#ifndef NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED -#define NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED - -#include -#include - -//#if defined(DEBUG) || defined(_DEBUG) - static inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) - { - if (code != cudaSuccess) { - fprintf(stderr, "CUDA error %u: %s (%s:%d)\n", unsigned(code), cudaGetErrorString(code), file, line); - //fprintf(stderr, "CUDA Runtime Error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } - } - static inline void ptrAssert(const void* ptr, const char* msg, 
const char* file, int line, bool abort = true) - { - if (ptr == nullptr) { - fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); - if (abort) exit(1); - } else if (uint64_t(ptr) % NANOVDB_DATA_ALIGNMENT) { - fprintf(stderr, "Pointer misalignment error: %s %s %d\n", msg, file, line); - if (abort) exit(1); - } - } -//#else -// static inline void gpuAssert(cudaError_t, const char*, int, bool = true){} -// static inline void ptrAssert(void*, const char*, const char*, int, bool = true){} -//#endif - -// Convenience function for checking CUDA runtime API results -// can be wrapped around any runtime API call. No-op in release builds. -#define cudaCheck(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - } - -#define checkPtr(ptr, msg) \ - { \ - ptrAssert((ptr), (msg), __FILE__, __LINE__); \ - } - -#define cudaSync() \ - { \ - cudaCheck(cudaDeviceSynchronize()); \ - } - -#define cudaCheckError() \ - { \ - cudaCheck(cudaGetLastError()); \ - } - -#if CUDART_VERSION < 11020 // 11.2 introduced cudaMallocAsync and cudaFreeAsync - -/// @brief Dummy implementation of cudaMallocAsync that calls cudaMalloc -/// @param d_ptr Device pointer to allocated device memory -/// @param size Number of bytes to allocate -/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored) -/// @return Cuda error code -inline cudaError_t cudaMallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);} - -/// @brief Dummy implementation of cudaFreeAsync that calls cudaFree -/// @param d_ptr Device pointer that will be freed -/// @param dummy The stream establishing the stream ordering promise (ignored) -/// @return Cuda error code -inline cudaError_t cudaFreeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);} - -#endif - -#if defined(__CUDACC__)// the following functions only run on the GPU! - -// --- Wrapper for launching lambda kernels -template -__global__ void cudaLambdaKernel(const size_t numItems, Func func, Args... args) -{ - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= numItems) return; - func(tid, args...); -} - -/// @brief Copy characters from @c src to @c dst on the device. -/// @param dst pointer to the character array to write to. -/// @param src pointer to the null-terminated character string to copy from. -/// @return pointer to the character array being written to. -/// @note Emulates the behaviour of std::strcpy. -__device__ inline char* cudaStrcpy(char *dst, const char *src) -{ - char *p = dst; - do {*p++ = *src;} while(*src++); - return dst; -} - -/// @brief Appends a copy of the character string pointed to by @c src to -/// the end of the character string pointed to by @c dst on the device. -/// @param dst pointer to the null-terminated byte string to append to. -/// @param src pointer to the null-terminated byte string to copy from. -/// @return pointer to the character array being appended to. -/// @note Emulates the behaviour of std::strcat. -__device__ inline char* cudaStrcat(char *dst, const char *src) -{ - char *p = dst; - while (*p) ++p; - cudaStrcpy(p, src); - return dst; -} - -/// @brief Compares two null-terminated byte strings lexicographically on the device. -/// @param lhs pointer to the null-terminated byte strings to compare -/// @param rhs pointer to the null-terminated byte strings to compare -/// @return Negative value if @c lhs appears before @c rhs in lexicographical order. -/// Zero if @c lhs and @c rhs compare equal. 
Positive value if @c lhs appears -/// after @c rhs in lexicographical order. -__device__ inline int cudaStrcmp(const char *lhs, const char *rhs) -{ - while(*lhs && (*lhs == *rhs)){ - lhs++; - rhs++; - } - return *(const unsigned char*)lhs - *(const unsigned char*)rhs;// zero if lhs == rhs -} - -/// @brief Test if two null-terminated byte strings are the same -/// @param lhs pointer to the null-terminated byte strings to compare -/// @param rhs pointer to the null-terminated byte strings to compare -/// @return true if the two c-strings are identical -__device__ inline bool cudaStrEq(const char *lhs, const char *rhs) -{ - return cudaStrcmp(lhs, rhs) == 0; -} - -#endif// __CUDACC__ - -#endif// NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED \ No newline at end of file +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/cuda/Util.h instead.") \ No newline at end of file diff --git a/nanovdb/nanovdb/util/cuda/GpuTimer.h b/nanovdb/nanovdb/util/cuda/GpuTimer.h index 6c6e217403..be7f81b227 100644 --- a/nanovdb/nanovdb/util/cuda/GpuTimer.h +++ b/nanovdb/nanovdb/util/cuda/GpuTimer.h @@ -1,110 +1,6 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 -/// @file GpuTimer.h -/// -/// @author Ken Museth -/// -/// @brief A simple GPU timing class - -#ifndef NANOVDB_GPU_TIMER_H_HAS_BEEN_INCLUDED -#define NANOVDB_GPU_TIMER_H_HAS_BEEN_INCLUDED - -#include // for std::cerr -#include -#include - -namespace nanovdb { - -class GpuTimer -{ - cudaStream_t mStream{0}; - cudaEvent_t mStart, mStop; - -public: - /// @brief Default constructor - /// @param stream CUDA stream to be timed (defaults to stream 0) - /// @note Starts the timer - GpuTimer(cudaStream_t stream = 0) : mStream(stream) - { - cudaEventCreate(&mStart); - cudaEventCreate(&mStop); - cudaEventRecord(mStart, mStream); - } - - /// @brief Construct and start the timer - /// @param msg string message to be printed when timer is started - /// @param stream CUDA stream to be timed (defaults to stream 0) - /// @param os output stream for the message above - GpuTimer(const std::string &msg, cudaStream_t stream = 0, std::ostream& os = std::cerr) - : mStream(stream) - { - os << msg << " ... " << std::flush; - cudaEventCreate(&mStart); - cudaEventCreate(&mStop); - cudaEventRecord(mStart, mStream); - } - - /// @brief Destructor - ~GpuTimer() - { - cudaEventDestroy(mStart); - cudaEventDestroy(mStop); - } - - /// @brief Start the timer - /// @param stream CUDA stream to be timed (defaults to stream 0) - /// @param os output stream for the message above - void start() {cudaEventRecord(mStart, mStream);} - - /// @brief Start the timer - /// @param msg string message to be printed when timer is started - - /// @param os output stream for the message above - void start(const std::string &msg, std::ostream& os = std::cerr) - { - os << msg << " ... " << std::flush; - this->start(); - } - - /// @brief Start the timer - /// @param msg string message to be printed when timer is started - /// @param os output stream for the message above - void start(const char* msg, std::ostream& os = std::cerr) - { - os << msg << " ... 
" << std::flush; - this->start(); - } - - /// @brief elapsed time (since start) in miliseconds - /// @return elapsed time (since start) in miliseconds - float elapsed() - { - cudaEventRecord(mStop, mStream); - cudaEventSynchronize(mStop); - float diff = 0.0f; - cudaEventElapsedTime(&diff, mStart, mStop); - return diff; - } - - /// @brief stop the timer - /// @param os output stream for the message above - void stop(std::ostream& os = std::cerr) - { - float diff = this->elapsed(); - os << "completed in " << diff << " milliseconds" << std::endl; - } - - /// @brief stop and start the timer - /// @param msg string message to be printed when timer is started - /// @warning Remember to call start before restart - void restart(const std::string &msg, std::ostream& os = std::cerr) - { - this->stop(); - this->start(msg, os); - } -};// GpuTimer - -} // namespace nanovdb - -#endif // NANOVDB_GPU_TIMER_H_HAS_BEEN_INCLUDED +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/cuda/Timer.h instead.") \ No newline at end of file diff --git a/nanovdb/nanovdb/util/cuda/Timer.h b/nanovdb/nanovdb/util/cuda/Timer.h new file mode 100644 index 0000000000..1bb7224461 --- /dev/null +++ b/nanovdb/nanovdb/util/cuda/Timer.h @@ -0,0 +1,116 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/// @file nanovdb/util/cuda/Timer.h +/// +/// @author Ken Museth +/// +/// @brief A simple GPU timing class + +#ifndef NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED + +#include // for std::cerr +#include +#include + +namespace nanovdb { + +namespace util::cuda { + +class Timer +{ + cudaStream_t mStream{0}; + cudaEvent_t mStart, mStop; + +public: + /// @brief Default constructor + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @note Starts the timer + Timer(cudaStream_t stream = 0) : mStream(stream) + { + cudaEventCreate(&mStart); + cudaEventCreate(&mStop); + cudaEventRecord(mStart, mStream); + } + + /// @brief Construct and start the timer + /// @param msg string message to be printed when timer is started + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @param os output stream for the message above + Timer(const std::string &msg, cudaStream_t stream = 0, std::ostream& os = std::cerr) + : mStream(stream) + { + os << msg << " ... " << std::flush; + cudaEventCreate(&mStart); + cudaEventCreate(&mStop); + cudaEventRecord(mStart, mStream); + } + + /// @brief Destructor + ~Timer() + { + cudaEventDestroy(mStart); + cudaEventDestroy(mStop); + } + + /// @brief Start the timer + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @param os output stream for the message above + void start() {cudaEventRecord(mStart, mStream);} + + /// @brief Start the timer + /// @param msg string message to be printed when timer is started + + /// @param os output stream for the message above + void start(const std::string &msg, std::ostream& os = std::cerr) + { + os << msg << " ... " << std::flush; + this->start(); + } + + /// @brief Start the timer + /// @param msg string message to be printed when timer is started + /// @param os output stream for the message above + void start(const char* msg, std::ostream& os = std::cerr) + { + os << msg << " ... 
" << std::flush; + this->start(); + } + + /// @brief elapsed time (since start) in miliseconds + /// @return elapsed time (since start) in miliseconds + float elapsed() + { + cudaEventRecord(mStop, mStream); + cudaEventSynchronize(mStop); + float diff = 0.0f; + cudaEventElapsedTime(&diff, mStart, mStop); + return diff; + } + + /// @brief stop the timer + /// @param os output stream for the message above + void stop(std::ostream& os = std::cerr) + { + float diff = this->elapsed(); + os << "completed in " << diff << " milliseconds" << std::endl; + } + + /// @brief stop and start the timer + /// @param msg string message to be printed when timer is started + /// @warning Remember to call start before restart + void restart(const std::string &msg, std::ostream& os = std::cerr) + { + this->stop(); + this->start(msg, os); + } +};// Timer + +}// namespace util::cuda + +using GpuTimer [[deprecated("Use nanovdb::util::cuda::Timer instead")]]= util::cuda::Timer; + +} // namespace nanovdb + +#endif // NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/util/cuda/Util.h b/nanovdb/nanovdb/util/cuda/Util.h new file mode 100644 index 0000000000..8ebfde61e2 --- /dev/null +++ b/nanovdb/nanovdb/util/cuda/Util.h @@ -0,0 +1,193 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/util/cuda/Util.h + + \author Ken Museth + + \date December 20, 2023 + + \brief Cuda specific utility functions +*/ + +#ifndef NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED + +#include +#include +#include // for stderr and NANOVDB_ASSERT + +// change 1 -> 0 to only perform asserts during debug builds +#if 1 || defined(DEBUG) || defined(_DEBUG) + static inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) + { + if (code != cudaSuccess) { + fprintf(stderr, "CUDA error %u: %s (%s:%d)\n", unsigned(code), cudaGetErrorString(code), file, line); + //fprintf(stderr, "CUDA Runtime Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } + } + static inline void ptrAssert(const void* ptr, const char* msg, const char* file, int line, bool abort = true) + { + if (ptr == nullptr) { + fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); + if (abort) exit(1); + } else if (uint64_t(ptr) % 32) { + fprintf(stderr, "Pointer misalignment error: %s %s %d\n", msg, file, line); + if (abort) exit(1); + } + } +#else + static inline void gpuAssert(cudaError_t, const char*, int, bool = true){} + static inline void ptrAssert(void*, const char*, const char*, int, bool = true){} +#endif + +// Convenience function for checking CUDA runtime API results +// can be wrapped around any runtime API call. No-op in release builds. +#define cudaCheck(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } + +#define checkPtr(ptr, msg) \ + { \ + ptrAssert((ptr), (msg), __FILE__, __LINE__); \ + } + +#define cudaSync() \ + { \ + cudaCheck(cudaDeviceSynchronize()); \ + } + +#define cudaCheckError() \ + { \ + cudaCheck(cudaGetLastError()); \ + } + +namespace nanovdb {// ========================================================= + +namespace util::cuda {// ====================================================== + +//#define NANOVDB_USE_SYNC_CUDA_MALLOC +// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2 so we introduce +// custom implementations that map to cudaMalloc and cudaFree below. 
+
+namespace nanovdb {// =========================================================
+
+namespace util::cuda {// ======================================================
+
+//#define NANOVDB_USE_SYNC_CUDA_MALLOC
+// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2, so below we
+// provide custom implementations that map to cudaMalloc and cudaFree. If
+// NANOVDB_USE_SYNC_CUDA_MALLOC is defined, these synchronous implementations
+// are used regardless of the CUDA version, which is useful in virtualized
+// environments that slice up the GPU and share it between instances as vGPUs.
+// There, GPU unified memory is usually disabled out of security considerations,
+// and since asynchronous CUDA malloc/free depends on GPU unified memory,
+// cudaMallocAsync and cudaFreeAsync cannot be used in such environments.
+
+#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC) // 11.2 introduced cudaMallocAsync and cudaFreeAsync
+
+/// @brief Simple wrapper that calls cudaMalloc
+/// @param d_ptr Device pointer to allocated device memory
+/// @param size Number of bytes to allocate
+/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
+/// @return Cuda error code
+inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}
+
+/// @brief Simple wrapper that calls cudaFree
+/// @param d_ptr Device pointer that will be freed
+/// @param dummy The stream establishing the stream ordering promise (ignored)
+/// @return Cuda error code
+inline cudaError_t freeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
+
+#else
+
+/// @brief Simple wrapper that calls cudaMallocAsync
+/// @param d_ptr Device pointer to allocated device memory
+/// @param size Number of bytes to allocate
+/// @param stream The stream establishing the stream ordering contract and the memory pool to allocate from
+/// @return Cuda error code
+inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t stream){return cudaMallocAsync(d_ptr, size, stream);}
+
+/// @brief Simple wrapper that calls cudaFreeAsync
+/// @param d_ptr Device pointer that will be freed
+/// @param stream The stream establishing the stream ordering promise
+/// @return Cuda error code
+inline cudaError_t freeAsync(void* d_ptr, cudaStream_t stream){return cudaFreeAsync(d_ptr, stream);}
+
+#endif
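// [annotation — not part of the patch] Both branches expose identical signatures,
// so callers stay agnostic to the CUDA version and to the NANOVDB_USE_SYNC_CUDA_MALLOC
// fallback. A minimal usage sketch:
//
//   cudaStream_t stream;
//   cudaCheck(cudaStreamCreate(&stream));
//   void *d_ptr = nullptr;
//   cudaCheck(nanovdb::util::cuda::mallocAsync(&d_ptr, 4096, stream));
//   // ... enqueue kernels on `stream` that use d_ptr ...
//   cudaCheck(nanovdb::util::cuda::freeAsync(d_ptr, stream));
//   cudaCheck(cudaStreamSynchronize(stream));
//   cudaCheck(cudaStreamDestroy(stream));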
+/// @brief Simple (naive) implementation of a unique device pointer
+/// using stream ordered memory allocation and deallocation.
+/// @tparam T Type of the device pointer
+template <typename T>
+class unique_ptr
+{
+    T *mPtr;// pointer to stream ordered memory allocation
+    cudaStream_t mStream;
+public:
+    unique_ptr(size_t count = 0, cudaStream_t stream = 0) : mPtr(nullptr), mStream(stream)
+    {
+        if (count>0) cudaCheck(mallocAsync((void**)&mPtr, count*sizeof(T), stream));
+    }
+    unique_ptr(const unique_ptr&) = delete;
+    unique_ptr(unique_ptr&& other) noexcept : mPtr(other.mPtr), mStream(other.mStream)
+    {
+        other.mPtr = nullptr;
+    }
+    ~unique_ptr()
+    {
+        if (mPtr) cudaCheck(freeAsync(mPtr, mStream));
+    }
+    unique_ptr& operator=(const unique_ptr&) = delete;
+    unique_ptr& operator=(unique_ptr&& rhs) noexcept
+    {
+        if (mPtr) cudaCheck(freeAsync(mPtr, mStream));// release any prior allocation before taking ownership
+        mPtr = rhs.mPtr;
+        mStream = rhs.mStream;
+        rhs.mPtr = nullptr;
+        return *this;
+    }
+    void reset() {
+        if (mPtr) {
+            cudaCheck(freeAsync(mPtr, mStream));
+            mPtr = nullptr;
+        }
+    }
+    T* get() const {return mPtr;}
+    explicit operator bool() const {return mPtr != nullptr;}
+};// util::cuda::unique_ptr
+
+/// @brief Computes the number of blocks per grid given the problem size and number of threads per block
+/// @param numItems Problem size
+/// @param threadsPerBlock Number of threads per block (second CUDA launch parameter)
+/// @return number of blocks per grid (first CUDA launch parameter)
+/// @note CUDA launch parameters: kernel<<< blocksPerGrid, threadsPerBlock, sharedMemSize, streamID>>>
+inline size_t blocksPerGrid(size_t numItems, size_t threadsPerBlock)
+{
+    NANOVDB_ASSERT(numItems > 0 && threadsPerBlock >= 32 && threadsPerBlock % 32 == 0);
+    return (numItems + threadsPerBlock - 1) / threadsPerBlock;
+}
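// [annotation — not part of the patch] Example: numItems = 1000 and
// threadsPerBlock = 128 gives (1000 + 127)/128 = 8 blocks; the 24 surplus
// threads in the last block are masked out by the tid >= numItems guard in
// lambdaKernel below.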
+
+#if defined(__CUDACC__)// the following functions only run on the GPU!
+
+/// @brief Cuda kernel that launches device lambda functions
+/// @param numItems Problem size
+template<typename Func, typename... Args>
+__global__ void lambdaKernel(const size_t numItems, Func func, Args... args)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+    func(tid, args...);
+}// util::cuda::lambdaKernel
+
+#endif// __CUDACC__
+
+}// namespace util::cuda ============================================================
+
+}// namespace nanovdb ===============================================================
+
+#if defined(__CUDACC__)// the following functions only run on the GPU!
+template<typename Func, typename... Args>
+[[deprecated("Use nanovdb::util::cuda::lambdaKernel instead")]]
+__global__ void cudaLambdaKernel(const size_t numItems, Func func, Args... args)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+    func(tid, args...);
+}
+#endif// __CUDACC__
+
+#endif// NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED
\ No newline at end of file
diff --git a/openvdb/openvdb/CMakeLists.txt b/openvdb/openvdb/CMakeLists.txt
index f56fdc198c..6f0826eb81 100644
--- a/openvdb/openvdb/CMakeLists.txt
+++ b/openvdb/openvdb/CMakeLists.txt
@@ -370,6 +370,7 @@ set(OPENVDB_LIBRARY_SOURCE_FILES
   points/AttributeSet.cc
   points/StreamCompression.cc
   points/points.cc
+  util/Assert.cc
   util/Formats.cc
 )

@@ -538,6 +539,7 @@ set(OPENVDB_LIBRARY_TREE_INCLUDE_FILES
 )

 set(OPENVDB_LIBRARY_UTIL_INCLUDE_FILES
+  util/Assert.h
   util/CpuTimer.h
   util/ExplicitInstantiation.h
   util/Formats.h
diff --git a/openvdb/openvdb/Grid.h b/openvdb/openvdb/Grid.h
index aedb005837..14aad39122 100644
--- a/openvdb/openvdb/Grid.h
+++ b/openvdb/openvdb/Grid.h
@@ -10,9 +10,9 @@
 #include "io/io.h"
 #include "math/Transform.h"
 #include "tree/Tree.h"
+#include "util/Assert.h"
 #include "util/logging.h"
 #include "util/Name.h"
-#include <cassert>
 #include
 #include
 #include
@@ -1610,7 +1610,7 @@ Grid<TreeT>::readBuffers(std::istream& is)
     uint16_t numPasses = 1;
     is.read(reinterpret_cast<char*>(&numPasses), sizeof(uint16_t));
     const io::StreamMetadata::Ptr meta = io::getStreamMetadataPtr(is);
-    assert(bool(meta));
+    OPENVDB_ASSERT(bool(meta));
     for (uint16_t passIndex = 0; passIndex < numPasses; ++passIndex) {
         uint32_t pass = (uint32_t(numPasses) << 16) | uint32_t(passIndex);
         meta->setPass(pass);
@@ -1632,7 +1632,7 @@ Grid<TreeT>::readBuffers(std::istream& is, const CoordBBox& bbox)
     uint16_t numPasses = 1;
     is.read(reinterpret_cast<char*>(&numPasses), sizeof(uint16_t));
     const io::StreamMetadata::Ptr meta = io::getStreamMetadataPtr(is);
-    assert(bool(meta));
+    OPENVDB_ASSERT(bool(meta));
     for (uint16_t passIndex = 0; passIndex < numPasses; ++passIndex) {
         uint32_t pass = (uint32_t(numPasses) << 16) | uint32_t(passIndex);
         meta->setPass(pass);
@@ -1662,7 +1662,7 @@ Grid<TreeT>::writeBuffers(std::ostream& os) const
     } else {
         // Determine how many leaf buffer passes are required for this grid
         const io::StreamMetadata::Ptr meta = io::getStreamMetadataPtr(os);
-        assert(bool(meta));
+        OPENVDB_ASSERT(bool(meta));
         uint16_t numPasses = 1;
         meta->setCountingPasses(true);
         meta->setPass(0);
diff --git a/openvdb/openvdb/Metadata.h b/openvdb/openvdb/Metadata.h
index 4a2d4d7463..8105c225ad 100644
--- a/openvdb/openvdb/Metadata.h
+++ b/openvdb/openvdb/Metadata.h
@@ -9,6 +9,7 @@
 #include "Types.h"
 #include "math/Math.h" // for math::isZero()
 #include "util/Name.h"
+#include "util/Assert.h"
 #include
 #include
 #include
@@ -279,9 +280,9 @@ TypedMetadata<T>::copy(const Metadata &other)

 template<typename T>
 inline void
-TypedMetadata<T>::readValue(std::istream& is, Index32 /*numBytes*/)
+TypedMetadata<T>::readValue(std::istream& is, [[maybe_unused]] Index32 numBytes)
 {
-    //assert(this->size() == numBytes);
+    OPENVDB_ASSERT(this->size() == numBytes);
     is.read(reinterpret_cast<char*>(&mValue), this->size());
 }

diff --git a/openvdb/openvdb/Types.h b/openvdb/openvdb/Types.h
index ad811e1520..d654750b7e 100644
--- a/openvdb/openvdb/Types.h
+++ b/openvdb/openvdb/Types.h
@@ -688,6 +688,23 @@ class Steal {};
 /// @brief Tag dispatch class that distinguishes constructors during file input
 class PartialCreate {};

+// For half compilation
+namespace math {
+template<>
+inline auto cwiseAdd(const math::Vec3<math::half>& v, const float s)
+{
+    math::Vec3<math::half> out;
+    const math::half* ip = v.asPointer();
+    math::half* op = out.asPointer();
+    for (unsigned i = 0; i < 3; ++i, ++op, ++ip) {
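        // [annotation — not part of the patch] *ip + s promotes the half
        // operand to float; assigning the float sum back to a half narrows,
        // which is precisely the conversion the surrounding guards silence.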
OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN + *op = *ip + s; + OPENVDB_NO_TYPE_CONVERSION_WARNING_END + } + return out; +} +} // namespace math + } // namespace OPENVDB_VERSION_NAME } // namespace openvdb diff --git a/openvdb/openvdb/io/Compression.cc b/openvdb/openvdb/io/Compression.cc index bdfba7b5cf..bfe1edcd87 100644 --- a/openvdb/openvdb/io/Compression.cc +++ b/openvdb/openvdb/io/Compression.cc @@ -4,6 +4,7 @@ #include "Compression.h" #include +#include <openvdb/util/Assert.h> #include #ifdef OPENVDB_USE_ZLIB #include @@ -101,7 +102,7 @@ zipToStream(std::ostream& os, const char* data, size_t numBytes) } else { // Write the size of the uncompressed data. // numBytes expected to be <= the max value + 1 of a signed int64 - assert(numBytes < size_t(std::numeric_limits<Int64>::max())); + OPENVDB_ASSERT(numBytes < size_t(std::numeric_limits<Int64>::max())); Int64 negBytes = -Int64(numBytes); os.write(reinterpret_cast<const char*>(&negBytes), 8); // Write the uncompressed data. @@ -230,7 +231,7 @@ bloscToStream(std::ostream& os, const char* data, size_t valSize, size_t numVals { const size_t inBytes = valSize * numVals; // inBytes expected to be <= the max value + 1 of a signed int64 - assert(inBytes < size_t(std::numeric_limits<Int64>::max())); + OPENVDB_ASSERT(inBytes < size_t(std::numeric_limits<Int64>::max())); int outBytes = int(inBytes) + BLOSC_MAX_OVERHEAD; std::unique_ptr<char[]> compressedData(new char[outBytes]); diff --git a/openvdb/openvdb/io/Compression.h b/openvdb/openvdb/io/Compression.h index 0dc8cabe52..0316db0544 100644 --- a/openvdb/openvdb/io/Compression.h +++ b/openvdb/openvdb/io/Compression.h @@ -7,6 +7,7 @@ #include #include #include <openvdb/math/Math.h> // for negative() +#include <openvdb/util/Assert.h> #include "io.h" // for getDataCompression(), etc. #include "DelayedLoadMetadata.h" #include @@ -249,7 +250,7 @@ readData(std::istream& is, T* data, Index count, uint32_t compression, { const bool seek = data == nullptr; if (seek) { - assert(!getStreamMetadataPtr(is) || getStreamMetadataPtr(is)->seekable()); + OPENVDB_ASSERT(!getStreamMetadataPtr(is) || getStreamMetadataPtr(is)->seekable()); } const bool hasCompression = compression & (COMPRESS_BLOSC | COMPRESS_ZIP); @@ -471,7 +472,7 @@ readCompressedValues(std::istream& is, ValueT* destBuf, Index destCount, const bool maskCompressed = compression & COMPRESS_ACTIVE_MASK; const bool seek = (destBuf == nullptr); - assert(!seek || (!meta || meta->seekable())); + OPENVDB_ASSERT(!seek || (!meta || meta->seekable())); // Get delayed load metadata if it exists @@ -732,7 +733,7 @@ writeCompressedValues(std::ostream& os, ValueT* srcBuf, Index srcCount, } // else inactive value 0 } } - assert(tempCount == valueMask.countOn()); + OPENVDB_ASSERT(tempCount == valueMask.countOn()); // Write out the mask that selects between two inactive values. 
selectionMask.save(os); diff --git a/openvdb/openvdb/io/DelayedLoadMetadata.cc b/openvdb/openvdb/io/DelayedLoadMetadata.cc index 7387968370..29a783705a 100644 --- a/openvdb/openvdb/io/DelayedLoadMetadata.cc +++ b/openvdb/openvdb/io/DelayedLoadMetadata.cc @@ -4,6 +4,7 @@ #include "DelayedLoadMetadata.h" #include +#include <openvdb/util/Assert.h> #ifdef OPENVDB_USE_BLOSC #include <blosc.h> @@ -120,27 +121,27 @@ void DelayedLoadMetadata::resizeCompressedSize(size_t size) DelayedLoadMetadata::MaskType DelayedLoadMetadata::getMask(size_t index) const { - assert(DelayedLoadMetadata::isRegisteredType()); - assert(index < mMask.size()); + OPENVDB_ASSERT(DelayedLoadMetadata::isRegisteredType()); + OPENVDB_ASSERT(index < mMask.size()); return mMask[index]; } void DelayedLoadMetadata::setMask(size_t index, const MaskType& value) { - assert(index < mMask.size()); + OPENVDB_ASSERT(index < mMask.size()); mMask[index] = value; } DelayedLoadMetadata::CompressedSizeType DelayedLoadMetadata::getCompressedSize(size_t index) const { - assert(DelayedLoadMetadata::isRegisteredType()); - assert(index < mCompressedSize.size()); + OPENVDB_ASSERT(DelayedLoadMetadata::isRegisteredType()); + OPENVDB_ASSERT(index < mCompressedSize.size()); return mCompressedSize[index]; } void DelayedLoadMetadata::setCompressedSize(size_t index, const CompressedSizeType& value) { - assert(index < mCompressedSize.size()); + OPENVDB_ASSERT(index < mCompressedSize.size()); mCompressedSize[index] = value; } @@ -174,7 +175,7 @@ void DelayedLoadMetadata::readValue(std::istream& is, Index32 numBytes) mMask.resize(count); // resize should never modify capacity for smaller vector sizes - assert(mMask.capacity() >= paddedCount); + OPENVDB_ASSERT(mMask.capacity() >= paddedCount); compression::bloscDecompress(reinterpret_cast<char*>(mMask.data()), count*sizeof(MaskType), mMask.capacity()*sizeof(MaskType), compressedBuffer.get()); #endif @@ -202,7 +203,7 @@ void DelayedLoadMetadata::readValue(std::istream& is, Index32 numBytes) mCompressedSize.resize(count); // resize should never modify capacity for smaller vector sizes - assert(mCompressedSize.capacity() >= paddedCount); + OPENVDB_ASSERT(mCompressedSize.capacity() >= paddedCount); compression::bloscDecompress(reinterpret_cast<char*>(mCompressedSize.data()), count*sizeof(CompressedSizeType), mCompressedSize.capacity()*sizeof(CompressedSizeType), compressedBuffer.get()); #endif @@ -231,12 +232,12 @@ void DelayedLoadMetadata::readValue(std::istream& is, Index32 numBytes) void DelayedLoadMetadata::writeValue(std::ostream& os) const { // metadata has a limit of 2^32 bytes - assert(mMask.size() < std::numeric_limits<Index32>::max()); - assert(mCompressedSize.size() < std::numeric_limits<Index32>::max()); + OPENVDB_ASSERT(mMask.size() < std::numeric_limits<Index32>::max()); + OPENVDB_ASSERT(mCompressedSize.size() < std::numeric_limits<Index32>::max()); if (mMask.empty() && mCompressedSize.empty()) return; - assert(mCompressedSize.empty() || (mMask.size() == mCompressedSize.size())); + OPENVDB_ASSERT(mCompressedSize.empty() || (mMask.size() == mCompressedSize.size())); Index32 count = static_cast<Index32>(mMask.size()); os.write(reinterpret_cast<const char*>(&count), sizeof(Index32)); @@ -254,7 +255,7 @@ void DelayedLoadMetadata::writeValue(std::ostream& os) const } if (compressedBuffer) { - assert(compressedBytes < std::numeric_limits<Index32>::max()); + OPENVDB_ASSERT(compressedBytes < std::numeric_limits<Index32>::max()); Index32 bytes(static_cast<Index32>(compressedBytes)); os.write(reinterpret_cast<const char*>(&bytes), sizeof(Index32)); os.write(reinterpret_cast<const char*>(compressedBuffer.get()), compressedBytes); @@ -281,7 +282,7 @@ void 
DelayedLoadMetadata::writeValue(std::ostream& os) const } if (compressedBuffer) { - assert(compressedBytes < std::numeric_limits::max()); + OPENVDB_ASSERT(compressedBytes < std::numeric_limits::max()); Index32 bytes(static_cast(compressedBytes)); os.write(reinterpret_cast(&bytes), sizeof(Index32)); os.write(reinterpret_cast(compressedBuffer.get()), compressedBytes); diff --git a/openvdb/openvdb/io/File.cc b/openvdb/openvdb/io/File.cc index 7bd52afedc..723b94691c 100644 --- a/openvdb/openvdb/io/File.cc +++ b/openvdb/openvdb/io/File.cc @@ -8,6 +8,7 @@ #include "TempFile.h" #include #include +#include #include #ifdef OPENVDB_USE_DELAYED_LOADING @@ -20,7 +21,6 @@ #include // stat() -#include #include // for getenv(), strtoul() #include // for strerror_r() #include @@ -47,7 +47,7 @@ struct File::Impl static GridBase::Ptr readGrid(const File& file, const GridDescriptor& gd, const BoxType& bbox) { // This method should not be called for files that don't contain grid offsets. - assert(file.inputHasGridOffsets()); + OPENVDB_ASSERT(file.inputHasGridOffsets()); GridBase::Ptr grid = file.createGrid(gd); gd.seekToGrid(file.inputStream()); @@ -679,7 +679,7 @@ void File::readGridDescriptors(std::istream& is) { // This method should not be called for files that don't contain grid offsets. - assert(inputHasGridOffsets()); + OPENVDB_ASSERT(inputHasGridOffsets()); gridDescriptors().clear(); @@ -767,7 +767,7 @@ GridBase::ConstPtr File::readGridPartial(const GridDescriptor& gd, bool readTopology) const { // This method should not be called for files that don't contain grid offsets. - assert(inputHasGridOffsets()); + OPENVDB_ASSERT(inputHasGridOffsets()); GridBase::Ptr grid = createGrid(gd); @@ -810,7 +810,7 @@ File::readGridPartial(GridBase::Ptr grid, std::istream& is, bool isInstance, bool readTopology) const { // This method should not be called for files that don't contain grid offsets. - assert(inputHasGridOffsets()); + OPENVDB_ASSERT(inputHasGridOffsets()); // This code needs to stay in sync with io::Archive::readGrid(), in terms of // the order of operations. 
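The pattern in all of the hunks above, and throughout the rest of this patch, is mechanical: each call site that used the standard assert macro now goes through OPENVDB_ASSERT, and the translation unit gains an include of the new util/Assert.h (built via the util/Assert.cc entry added to CMakeLists.txt earlier). The macro's definition is not part of this diff; the following is only a rough sketch of the idea, assuming a build-time switch here called OPENVDB_ENABLE_ASSERTS that is decoupled from NDEBUG (both names illustrative, not taken from this patch):

// Illustrative sketch only -- not the contents of openvdb/util/Assert.h.
// The point of a project-level macro is that assertions can be kept in
// optimized builds (or compiled out entirely) via a single switch, instead
// of being tied to NDEBUG the way the standard assert macro is.
#include <cstdio>
#include <cstdlib>

#ifdef OPENVDB_ENABLE_ASSERTS   // assumed CMake-controlled define
#define OPENVDB_ASSERT(X) \
    do { \
        if (!(X)) { \
            std::fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n", \
                #X, __FILE__, __LINE__); \
            std::abort(); \
        } \
    } while (0)
#else
#define OPENVDB_ASSERT(X) ((void)0)  // condition is compiled away entirely
#endif

One consequence worth noting: in the disabled form the condition is not evaluated at all, so converted call sites must not rely on side effects inside OPENVDB_ASSERT, exactly as with assert.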
diff --git a/openvdb/openvdb/math/ConjGradient.h b/openvdb/openvdb/math/ConjGradient.h index e922005445..2fd8bc4da2 100644 --- a/openvdb/openvdb/math/ConjGradient.h +++ b/openvdb/openvdb/math/ConjGradient.h @@ -12,12 +12,12 @@ #include #include #include +#include #include #include "Math.h" // for Abs(), isZero(), Max(), Sqrt() #include #include #include // for std::lower_bound() -#include #include // for std::isfinite() #include #include @@ -663,7 +663,7 @@ template inline T Vector::dot(const Vector& other) const { - assert(this->size() == other.size()); + OPENVDB_ASSERT(this->size() == other.size()); const T* aData = this->data(); const T* bData = other.data(); @@ -874,7 +874,7 @@ inline void SparseStencilMatrix::setValue(SizeType row, SizeType col, const ValueType& val) { - assert(row < mNumRows); + OPENVDB_ASSERT(row < mNumRows); this->getRowEditor(row).setValue(col, val); } @@ -883,7 +883,7 @@ template inline const ValueType& SparseStencilMatrix::getValue(SizeType row, SizeType col) const { - assert(row < mNumRows); + OPENVDB_ASSERT(row < mNumRows); return this->getConstRow(row).getValue(col); } @@ -1064,7 +1064,7 @@ template inline typename SparseStencilMatrix::RowEditor SparseStencilMatrix::getRowEditor(SizeType i) { - assert(i < mNumRows); + OPENVDB_ASSERT(i < mNumRows); const SizeType head = i * STENCIL_SIZE; return RowEditor(&mValueArray[head], &mColumnIdxArray[head], mRowSizeArray[i], mNumRows); } @@ -1074,7 +1074,7 @@ template inline typename SparseStencilMatrix::ConstRow SparseStencilMatrix::getConstRow(SizeType i) const { - assert(i < mNumRows); + OPENVDB_ASSERT(i < mNumRows); const SizeType head = i * STENCIL_SIZE; // index for this row into main storage return ConstRow(&mValueArray[head], &mColumnIdxArray[head], mRowSizeArray[i]); } @@ -1221,7 +1221,7 @@ inline SizeType SparseStencilMatrix::RowEditor::setValue( SizeType column, const ValueType& value) { - assert(column < mNumColumns); + OPENVDB_ASSERT(column < mNumColumns); RowData& data = RowBase<>::mData; @@ -1236,7 +1236,7 @@ SparseStencilMatrix::RowEditor::setValue( } // Check that it is safe to add a new column. - assert(data.mSize < this->capacity()); + OPENVDB_ASSERT(data.mSize < this->capacity()); if (offset >= data.mSize) { // The new column's index is larger than any existing index. Append the new column. 
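The ConjGradient.h assertions above all guard shape agreement between pcg vectors and the sparse stencil matrix. A small usage sketch of what they now enforce; this assumes the pcg::Vector constructors and dot() behave as shown in the hunks, and is illustrative rather than part of the patch:

// Mismatched operand sizes are a programming error, not a runtime
// condition, which is why they are asserted rather than thrown.
#include <openvdb/math/ConjGradient.h>

using openvdb::math::pcg::Vector;

int main()
{
    Vector<double> a(100, 1.0);   // 100 elements, all 1.0
    Vector<double> b(100, 2.0);   // 100 elements, all 2.0
    const double d = a.dot(b);    // OK: sizes agree, the assertion passes

    Vector<double> c(50, 3.0);
    // a.dot(c);  // would trip OPENVDB_ASSERT(this->size() == other.size())
    return d == 200.0 ? 0 : 1;
}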
@@ -1297,8 +1297,8 @@ class JacobiPreconditioner: public PreconditionergetValue(n, n); - assert(!isApproxZero(val, ValueType(0.0001))); + OPENVDB_ASSERT(!isApproxZero(val, ValueType(0.0001))); vec[n] = static_cast(1.0 / val); } } @@ -1466,8 +1466,8 @@ class IncompleteCholeskyPreconditioner: public Preconditioner()); @@ -1567,8 +1567,8 @@ template inline void axpy(const T& a, const Vector& xVec, const Vector& yVec, Vector& result) { - assert(xVec.size() == yVec.size()); - assert(xVec.size() == result.size()); + OPENVDB_ASSERT(xVec.size() == yVec.size()); + OPENVDB_ASSERT(xVec.size() == result.size()); axpy(a, xVec.data(), yVec.data(), result.data(), xVec.size()); } @@ -1590,9 +1590,9 @@ template inline void computeResidual(const MatrixOperator& A, const Vector& x, const Vector& b, Vector& r) { - assert(x.size() == b.size()); - assert(x.size() == r.size()); - assert(x.size() == A.numRows()); + OPENVDB_ASSERT(x.size() == b.size()); + OPENVDB_ASSERT(x.size() == r.size()); + OPENVDB_ASSERT(x.size() == A.numRows()); computeResidual(A, x.data(), b.data(), r.data()); } @@ -1664,7 +1664,7 @@ solve( internal::computeResidual(Amat, xVec, bVec, rVec); - assert(rVec.isFinite()); + OPENVDB_ASSERT(rVec.isFinite()); // Normalize the residual norm with the source norm and look for early out. result.absoluteError = static_cast(rVec.infNorm()); @@ -1699,7 +1699,7 @@ solve( // const ValueType rDotZ = rVec.dot(zVec); - assert(std::isfinite(rDotZ)); + OPENVDB_ASSERT(std::isfinite(rDotZ)); if (0 == iteration) { // Initialize @@ -1715,7 +1715,7 @@ solve( // alpha = / const ValueType pAp = pVec.dot(qVec); - assert(std::isfinite(pAp)); + OPENVDB_ASSERT(std::isfinite(pAp)); const ValueType alpha = rDotZ / pAp; rDotZPrev = rDotZ; diff --git a/openvdb/openvdb/math/Coord.h b/openvdb/openvdb/math/Coord.h index 00589fa598..154a3743e5 100644 --- a/openvdb/openvdb/math/Coord.h +++ b/openvdb/openvdb/math/Coord.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "Math.h" #include "Vec3.h" @@ -130,11 +131,11 @@ class Coord Int32 x() const { return mVec[0]; } Int32 y() const { return mVec[1]; } Int32 z() const { return mVec[2]; } - Int32 operator[](size_t i) const { assert(i < 3); return mVec[i]; } + Int32 operator[](size_t i) const { OPENVDB_ASSERT(i < 3); return mVec[i]; } Int32& x() { return mVec[0]; } Int32& y() { return mVec[1]; } Int32& z() { return mVec[2]; } - Int32& operator[](size_t i) { assert(i < 3); return mVec[i]; } + Int32& operator[](size_t i) { OPENVDB_ASSERT(i < 3); return mVec[i]; } const Int32* data() const { return mVec.data(); } Int32* data() { return mVec.data(); } @@ -306,7 +307,7 @@ class CoordBBox /// @note The other bounding box is assumed to be divisible. CoordBBox(CoordBBox& other, const tbb::split&): mMin(other.mMin), mMax(other.mMax) { - assert(this->is_divisible()); + OPENVDB_ASSERT(this->is_divisible()); const size_t n = this->maxExtent(); mMax[n] = (mMin[n] + mMax[n]) >> 1; other.mMin[n] = mMax[n] + 1; @@ -471,7 +472,7 @@ class CoordBBox /// least seven times, i.e. has storage for eight Coord elements! 
void getCornerPoints(Coord *p) const { - assert(p != nullptr); + OPENVDB_ASSERT(p != nullptr); p->reset(mMin.x(), mMin.y(), mMin.z()); ++p; p->reset(mMin.x(), mMin.y(), mMax.z()); ++p; p->reset(mMin.x(), mMax.y(), mMin.z()); ++p; diff --git a/openvdb/openvdb/math/DDA.h b/openvdb/openvdb/math/DDA.h index 908912cf46..ea034c69d5 100644 --- a/openvdb/openvdb/math/DDA.h +++ b/openvdb/openvdb/math/DDA.h @@ -14,6 +14,7 @@ #include "Math.h" #include "Vec3.h" #include +#include <openvdb/util/Assert.h> #include <iostream> // for std::ostream #include <limits> // for std::numeric_limits::max() @@ -50,7 +51,7 @@ class DDA inline void init(const RayT& ray, RealT startTime, RealT maxTime) { - assert(startTime <= maxTime); + OPENVDB_ASSERT(startTime <= maxTime); static const int DIM = 1 << Log2Dim; mT0 = startTime; mT1 = maxTime; diff --git a/openvdb/openvdb/math/Half.h b/openvdb/openvdb/math/Half.h index b8045ba441..ba72b0e6df 100644 --- a/openvdb/openvdb/math/Half.h +++ b/openvdb/openvdb/math/Half.h @@ -342,8 +342,14 @@ imath_half_to_float (imath_half_bits_t h) // other compilers may provide count-leading-zeros primitives, // but we need the community to inform us of the variants uint32_t lc; -# if defined(_MSC_VER) && (_M_IX86 || _M_X64) - lc = __lzcnt (hexpmant); +# if defined(_MSC_VER) + // The direct intrinsic for this is __lzcnt, but that is not supported + // on older x86_64 hardware or ARM. Instead this uses the bsr instruction + // and one additional subtraction. This assumes hexpmant != 0; for 0, + // bsr and lzcnt would behave differently. + unsigned long bsr; + _BitScanReverse (&bsr, hexpmant); + lc = (31 - bsr); # elif defined(__GNUC__) || defined(__clang__) lc = (uint32_t) __builtin_clz (hexpmant); # else diff --git a/openvdb/openvdb/math/Mat3.h b/openvdb/openvdb/math/Mat3.h index 6fe21bbc94..a503b22ca9 100644 --- a/openvdb/openvdb/math/Mat3.h +++ b/openvdb/openvdb/math/Mat3.h @@ -5,10 +5,10 @@ #define OPENVDB_MATH_MAT3_H_HAS_BEEN_INCLUDED #include +#include <openvdb/util/Assert.h> #include "Vec3.h" #include "Mat.h" #include <algorithm> // for std::copy() -#include <cassert> #include #include @@ -140,7 +140,7 @@ class Mat3: public Mat<3, T> /// Set ith row to vector v void setRow(int i, const Vec3<T> &v) { - // assert(i>=0 && i<3); + OPENVDB_ASSERT(i>=0 && i<3); int i3 = i * 3; MyBase::mm[i3+0] = v[0]; @@ -151,14 +151,14 @@ class Mat3: public Mat<3, T> /// Get ith row, e.g. Vec3d v = m.row(1); Vec3<T> row(int i) const { - // assert(i>=0 && i<3); + OPENVDB_ASSERT(i>=0 && i<3); return Vec3<T>((*this)(i,0), (*this)(i,1), (*this)(i,2)); } // rowColumnTest /// Set jth column to vector v void setCol(int j, const Vec3<T>& v) { - // assert(j>=0 && j<3); + OPENVDB_ASSERT(j>=0 && j<3); MyBase::mm[0+j] = v[0]; MyBase::mm[3+j] = v[1]; MyBase::mm[6+j] = v[2]; @@ -167,7 +167,7 @@ class Mat3: public Mat<3, T> /// Get jth column, e.g. Vec3d v = m.col(0); Vec3<T> col(int j) const { - // assert(j>=0 && j<3); + OPENVDB_ASSERT(j>=0 && j<3); return Vec3<T>((*this)(0,j), (*this)(1,j), (*this)(2,j)); } // rowColumnTest @@ -176,8 +176,8 @@ class Mat3: public Mat<3, T> /// e.g. m(0,0) = 1; T& operator()(int i, int j) { - // assert(i>=0 && i<3); - // assert(j>=0 && j<3); + OPENVDB_ASSERT(i>=0 && i<3); + OPENVDB_ASSERT(j>=0 && j<3); return MyBase::mm[3*i+j]; } // trivial @@ -186,8 +186,8 @@ class Mat3: public Mat<3, T> /// e.g. 
float f = m(1,0); T operator()(int i, int j) const { - // assert(i>=0 && i<3); - // assert(j>=0 && j<3); + OPENVDB_ASSERT(i>=0 && i<3); + OPENVDB_ASSERT(j>=0 && j<3); return MyBase::mm[3*i+j]; } // trivial diff --git a/openvdb/openvdb/math/Mat4.h b/openvdb/openvdb/math/Mat4.h index 33b5017d0e..7a3db1bfed 100644 --- a/openvdb/openvdb/math/Mat4.h +++ b/openvdb/openvdb/math/Mat4.h @@ -6,12 +6,12 @@ #include #include +#include <openvdb/util/Assert.h> #include "Math.h" #include "Mat3.h" #include "Vec3.h" #include "Vec4.h" #include <algorithm> // for std::copy(), std::swap() -#include <cassert> #include #include @@ -138,7 +138,7 @@ class Mat4: public Mat<4, T> /// Set ith row to vector v void setRow(int i, const Vec4<T> &v) { - // assert(i>=0 && i<4); + OPENVDB_ASSERT(i>=0 && i<4); int i4 = i * 4; MyBase::mm[i4+0] = v[0]; MyBase::mm[i4+1] = v[1]; @@ -149,14 +149,14 @@ class Mat4: public Mat<4, T> /// Get ith row, e.g. Vec4f v = m.row(1); Vec4<T> row(int i) const { - // assert(i>=0 && i<3); + OPENVDB_ASSERT(i>=0 && i<4); return Vec4<T>((*this)(i,0), (*this)(i,1), (*this)(i,2), (*this)(i,3)); } /// Set jth column to vector v void setCol(int j, const Vec4<T>& v) { - // assert(j>=0 && j<4); + OPENVDB_ASSERT(j>=0 && j<4); MyBase::mm[ 0+j] = v[0]; MyBase::mm[ 4+j] = v[1]; MyBase::mm[ 8+j] = v[2]; @@ -166,7 +166,7 @@ class Mat4: public Mat<4, T> /// Get jth column, e.g. Vec4f v = m.col(0); Vec4<T> col(int j) const { - // assert(j>=0 && j<4); + OPENVDB_ASSERT(j>=0 && j<4); return Vec4<T>((*this)(0,j), (*this)(1,j), (*this)(2,j), (*this)(3,j)); } @@ -175,8 +175,8 @@ class Mat4: public Mat<4, T> /// e.g. m(0,0) = 1; T& operator()(int i, int j) { - // assert(i>=0 && i<4); - // assert(j>=0 && j<4); + OPENVDB_ASSERT(i>=0 && i<4); + OPENVDB_ASSERT(j>=0 && j<4); return MyBase::mm[4*i+j]; } @@ -185,8 +185,8 @@ class Mat4: public Mat<4, T> /// e.g. float f = m(1,0); T operator()(int i, int j) const { - // assert(i>=0 && i<4); - // assert(j>=0 && j<4); + OPENVDB_ASSERT(i>=0 && i<4); + OPENVDB_ASSERT(j>=0 && j<4); return MyBase::mm[4*i+j]; } @@ -791,83 +791,85 @@ class Mat4: public Mat<4, T> /// product of v1 and v2. void setToRotation(const Vec3<T>& v1, const Vec3<T>& v2) {*this = rotation<Mat4<T> >(v1, v2);} - /// @brief Left multiplies by a rotation clock-wise about the given axis into this matrix. /// @param axis The axis (one of X, Y, Z) of rotation. /// @param angle The clock-wise rotation angle, in radians. 
void preRotate(Axis axis, T angle) { + OPENVDB_ASSERT(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS); + T c = static_cast(cos(angle)); T s = -static_cast(sin(angle)); // the "-" makes it clockwise - switch (axis) { - case X_AXIS: - { - T a4, a5, a6, a7; - - a4 = c * MyBase::mm[ 4] - s * MyBase::mm[ 8]; - a5 = c * MyBase::mm[ 5] - s * MyBase::mm[ 9]; - a6 = c * MyBase::mm[ 6] - s * MyBase::mm[10]; - a7 = c * MyBase::mm[ 7] - s * MyBase::mm[11]; + switch (axis) + { + case X_AXIS: + { + T a4, a5, a6, a7; + a4 = c * MyBase::mm[ 4] - s * MyBase::mm[ 8]; + a5 = c * MyBase::mm[ 5] - s * MyBase::mm[ 9]; + a6 = c * MyBase::mm[ 6] - s * MyBase::mm[10]; + a7 = c * MyBase::mm[ 7] - s * MyBase::mm[11]; - MyBase::mm[ 8] = s * MyBase::mm[ 4] + c * MyBase::mm[ 8]; - MyBase::mm[ 9] = s * MyBase::mm[ 5] + c * MyBase::mm[ 9]; - MyBase::mm[10] = s * MyBase::mm[ 6] + c * MyBase::mm[10]; - MyBase::mm[11] = s * MyBase::mm[ 7] + c * MyBase::mm[11]; - MyBase::mm[ 4] = a4; - MyBase::mm[ 5] = a5; - MyBase::mm[ 6] = a6; - MyBase::mm[ 7] = a7; - } - break; + MyBase::mm[ 8] = s * MyBase::mm[ 4] + c * MyBase::mm[ 8]; + MyBase::mm[ 9] = s * MyBase::mm[ 5] + c * MyBase::mm[ 9]; + MyBase::mm[10] = s * MyBase::mm[ 6] + c * MyBase::mm[10]; + MyBase::mm[11] = s * MyBase::mm[ 7] + c * MyBase::mm[11]; - case Y_AXIS: - { - T a0, a1, a2, a3; + MyBase::mm[ 4] = a4; + MyBase::mm[ 5] = a5; + MyBase::mm[ 6] = a6; + MyBase::mm[ 7] = a7; + } + break; - a0 = c * MyBase::mm[ 0] + s * MyBase::mm[ 8]; - a1 = c * MyBase::mm[ 1] + s * MyBase::mm[ 9]; - a2 = c * MyBase::mm[ 2] + s * MyBase::mm[10]; - a3 = c * MyBase::mm[ 3] + s * MyBase::mm[11]; + case Y_AXIS: + { + T a0, a1, a2, a3; - MyBase::mm[ 8] = -s * MyBase::mm[ 0] + c * MyBase::mm[ 8]; - MyBase::mm[ 9] = -s * MyBase::mm[ 1] + c * MyBase::mm[ 9]; - MyBase::mm[10] = -s * MyBase::mm[ 2] + c * MyBase::mm[10]; - MyBase::mm[11] = -s * MyBase::mm[ 3] + c * MyBase::mm[11]; + a0 = c * MyBase::mm[ 0] + s * MyBase::mm[ 8]; + a1 = c * MyBase::mm[ 1] + s * MyBase::mm[ 9]; + a2 = c * MyBase::mm[ 2] + s * MyBase::mm[10]; + a3 = c * MyBase::mm[ 3] + s * MyBase::mm[11]; + MyBase::mm[ 8] = -s * MyBase::mm[ 0] + c * MyBase::mm[ 8]; + MyBase::mm[ 9] = -s * MyBase::mm[ 1] + c * MyBase::mm[ 9]; + MyBase::mm[10] = -s * MyBase::mm[ 2] + c * MyBase::mm[10]; + MyBase::mm[11] = -s * MyBase::mm[ 3] + c * MyBase::mm[11]; - MyBase::mm[ 0] = a0; - MyBase::mm[ 1] = a1; - MyBase::mm[ 2] = a2; - MyBase::mm[ 3] = a3; - } - break; - case Z_AXIS: - { - T a0, a1, a2, a3; - - a0 = c * MyBase::mm[ 0] - s * MyBase::mm[ 4]; - a1 = c * MyBase::mm[ 1] - s * MyBase::mm[ 5]; - a2 = c * MyBase::mm[ 2] - s * MyBase::mm[ 6]; - a3 = c * MyBase::mm[ 3] - s * MyBase::mm[ 7]; - - MyBase::mm[ 4] = s * MyBase::mm[ 0] + c * MyBase::mm[ 4]; - MyBase::mm[ 5] = s * MyBase::mm[ 1] + c * MyBase::mm[ 5]; - MyBase::mm[ 6] = s * MyBase::mm[ 2] + c * MyBase::mm[ 6]; - MyBase::mm[ 7] = s * MyBase::mm[ 3] + c * MyBase::mm[ 7]; - - MyBase::mm[ 0] = a0; - MyBase::mm[ 1] = a1; - MyBase::mm[ 2] = a2; - MyBase::mm[ 3] = a3; - } - break; + MyBase::mm[ 0] = a0; + MyBase::mm[ 1] = a1; + MyBase::mm[ 2] = a2; + MyBase::mm[ 3] = a3; + } + break; + + case Z_AXIS: + { + T a0, a1, a2, a3; + + a0 = c * MyBase::mm[ 0] - s * MyBase::mm[ 4]; + a1 = c * MyBase::mm[ 1] - s * MyBase::mm[ 5]; + a2 = c * MyBase::mm[ 2] - s * MyBase::mm[ 6]; + a3 = c * MyBase::mm[ 3] - s * MyBase::mm[ 7]; + + MyBase::mm[ 4] = s * MyBase::mm[ 0] + c * MyBase::mm[ 4]; + MyBase::mm[ 5] = s * MyBase::mm[ 1] + c * MyBase::mm[ 5]; + MyBase::mm[ 6] = s * MyBase::mm[ 2] + c * MyBase::mm[ 6]; + 
MyBase::mm[ 7] = s * MyBase::mm[ 3] + c * MyBase::mm[ 7]; + + MyBase::mm[ 0] = a0; + MyBase::mm[ 1] = a1; + MyBase::mm[ 2] = a2; + MyBase::mm[ 3] = a3; + } + break; default: - assert(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS); + OPENVDB_ASSERT(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS); } } @@ -880,10 +882,9 @@ class Mat4: public Mat<4, T> T c = static_cast(cos(angle)); T s = -static_cast(sin(angle)); // the "-" makes it clockwise - - - switch (axis) { - case X_AXIS: + switch (axis) + { + case X_AXIS: { T a2, a6, a10, a14; @@ -905,7 +906,7 @@ class Mat4: public Mat<4, T> } break; - case Y_AXIS: + case Y_AXIS: { T a2, a6, a10, a14; @@ -926,7 +927,7 @@ class Mat4: public Mat<4, T> } break; - case Z_AXIS: + case Z_AXIS: { T a1, a5, a9, a13; @@ -948,8 +949,8 @@ class Mat4: public Mat<4, T> } break; - default: - assert(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS); + default: + OPENVDB_ASSERT(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS); } } diff --git a/openvdb/openvdb/math/Math.h b/openvdb/openvdb/math/Math.h index 1406774620..60d02c5c0e 100644 --- a/openvdb/openvdb/math/Math.h +++ b/openvdb/openvdb/math/Math.h @@ -10,6 +10,7 @@ #include #include +#include #include // for std::max() #include #include // for std::ceil(), std::fabs(), std::pow(), std::sqrt(), etc. @@ -259,7 +260,7 @@ template inline Type Clamp(Type x, Type min, Type max) { - assert( !(min>max) ); + OPENVDB_ASSERT( !(min>max) ); return x > min ? x < max ? x : max : min; } @@ -294,7 +295,7 @@ template inline Type SmoothUnitStep(Type x, Type min, Type max) { - assert(min < max); + OPENVDB_ASSERT(min < max); return SmoothUnitStep((x-min)/(max-min)); } @@ -573,14 +574,14 @@ Pow(Type x, int n) inline float Pow(float b, float e) { - assert( b >= 0.0f && "Pow(float,float): base is negative" ); + OPENVDB_ASSERT( b >= 0.0f && "Pow(float,float): base is negative" ); return powf(b,e); } inline double Pow(double b, double e) { - assert( b >= 0.0 && "Pow(double,double): base is negative" ); + OPENVDB_ASSERT( b >= 0.0 && "Pow(double,double): base is negative" ); return std::pow(b,e); } //@} @@ -892,7 +893,7 @@ template inline Type Inv(Type x) { - assert(x); + OPENVDB_ASSERT(x); return Type(1)/x; } diff --git a/openvdb/openvdb/math/Quat.h b/openvdb/openvdb/math/Quat.h index 554cc8b6c7..d9db902aac 100644 --- a/openvdb/openvdb/math/Quat.h +++ b/openvdb/openvdb/math/Quat.h @@ -9,6 +9,7 @@ #include "Math.h" #include "Vec3.h" #include +#include #include #include #include @@ -111,7 +112,7 @@ class Quat /// unit vector Quat(const Vec3 &axis, T angle) { - // assert( REL_EQ(axis.length(), 1.) ); + OPENVDB_ASSERT(isApproxEqual(axis.length(), T(1))); T s = T(sin(angle*T(0.5))); diff --git a/openvdb/openvdb/math/Ray.h b/openvdb/openvdb/math/Ray.h index e0d92b01d8..260ec478d3 100644 --- a/openvdb/openvdb/math/Ray.h +++ b/openvdb/openvdb/math/Ray.h @@ -13,6 +13,7 @@ #include "Math.h" #include "Vec3.h" #include "Transform.h" +#include #include // for std::swap() #include // for std::ostream #include // for std::numeric_limits::max() @@ -48,7 +49,7 @@ class Ray /// @brief Return the midpoint of the ray. 
inline RealT mid() const { return 0.5*(t0 + t1); } /// @brief Multiplies both times - inline void scale(RealT s) {assert(s>0); t0*=s; t1*=s; } + inline void scale(RealT s) {OPENVDB_ASSERT(s>0); t0*=s; t1*=s; } /// @brief Return @c true if time is inclusive inline bool test(RealT t) const { return (t>=t0 && t<=t1); } }; @@ -69,15 +70,15 @@ class Ray mInvDir = 1/mDir; } - inline void setMinTime(RealT t0) { assert(t0>0); mTimeSpan.t0 = t0; } + inline void setMinTime(RealT t0) { OPENVDB_ASSERT(t0>0); mTimeSpan.t0 = t0; } - inline void setMaxTime(RealT t1) { assert(t1>0); mTimeSpan.t1 = t1; } + inline void setMaxTime(RealT t1) { OPENVDB_ASSERT(t1>0); mTimeSpan.t1 = t1; } inline void setTimes( RealT t0 = math::Delta::value(), RealT t1 = std::numeric_limits::max()) { - assert(t0>0 && t1>0); + OPENVDB_ASSERT(t0>0 && t1>0); mTimeSpan.set(t0, t1); } @@ -131,8 +132,8 @@ class Ray template inline Ray applyMap(const MapType& map) const { - assert(map.isLinear()); - assert(math::isRelOrApproxEqual(mDir.length(), RealT(1), + OPENVDB_ASSERT(map.isLinear()); + OPENVDB_ASSERT(math::isRelOrApproxEqual(mDir.length(), RealT(1), Tolerance::value(), Delta::value())); const Vec3T eye = map.applyMap(mEye); const Vec3T dir = map.applyJacobian(mDir); @@ -149,8 +150,8 @@ class Ray template inline Ray applyInverseMap(const MapType& map) const { - assert(map.isLinear()); - assert(math::isRelOrApproxEqual(mDir.length(), RealT(1), Tolerance::value(), Delta::value())); + OPENVDB_ASSERT(map.isLinear()); + OPENVDB_ASSERT(math::isRelOrApproxEqual(mDir.length(), RealT(1), Tolerance::value(), Delta::value())); const Vec3T eye = map.applyInverseMap(mEye); const Vec3T dir = map.applyInverseJacobian(mDir); const RealT length = dir.length(); diff --git a/openvdb/openvdb/math/Stats.h b/openvdb/openvdb/math/Stats.h index 693af25a5a..61ff1a8cc3 100644 --- a/openvdb/openvdb/math/Stats.h +++ b/openvdb/openvdb/math/Stats.h @@ -13,6 +13,7 @@ #include // for ostringstream #include #include +#include #include #include #include @@ -156,7 +157,7 @@ class Extrema inline void join(const Extrema& other) { - assert(other.mSize > 0); + OPENVDB_ASSERT(other.mSize > 0); mSize += other.mSize; mMin = std::min(mMin, other.mMin); mMax = std::max(mMax, other.mMax); diff --git a/openvdb/openvdb/math/Stencils.h b/openvdb/openvdb/math/Stencils.h index ce6243af2e..58d96d51d4 100644 --- a/openvdb/openvdb/math/Stencils.h +++ b/openvdb/openvdb/math/Stencils.h @@ -96,7 +96,7 @@ class BaseStencil /// which is typically the center point of the stencil. inline const ValueType& getValue(unsigned int pos = 0) const { - assert(pos < mValues.size()); + OPENVDB_ASSERT(pos < mValues.size()); return mValues[pos]; } @@ -121,7 +121,7 @@ class BaseStencil inline ValueType median() const { BufferType tmp(mValues);//local copy - assert(!tmp.empty()); + OPENVDB_ASSERT(!tmp.empty()); size_t midpoint = (tmp.size() - 1) >> 1; // Partially sort the vector until the median value is at the midpoint. 
#if !defined(_MSC_VER) || _MSC_VER < 1924 @@ -343,9 +343,9 @@ class BoxStencil: public BaseStencil, GridT, IsSafe> const ValueType w = xyz[2] - BaseType::mCenter[2]; OPENVDB_NO_TYPE_CONVERSION_WARNING_END - assert(u>=0 && u<=1); - assert(v>=0 && v<=1); - assert(w>=0 && w<=1); + OPENVDB_ASSERT(u>=0 && u<=1); + OPENVDB_ASSERT(v>=0 && v<=1); + OPENVDB_ASSERT(w>=0 && w<=1); ValueType V = BaseType::template getValue<0,0,0>(); ValueType A = static_cast(V + (BaseType::template getValue<0,0,1>() - V) * w); @@ -377,9 +377,9 @@ class BoxStencil: public BaseStencil, GridT, IsSafe> const ValueType w = xyz[2] - BaseType::mCenter[2]; OPENVDB_NO_TYPE_CONVERSION_WARNING_END - assert(u>=0 && u<=1); - assert(v>=0 && v<=1); - assert(w>=0 && w<=1); + OPENVDB_ASSERT(u>=0 && u<=1); + OPENVDB_ASSERT(v>=0 && v<=1); + OPENVDB_ASSERT(w>=0 && w<=1); ValueType D[4]={BaseType::template getValue<0,0,1>()-BaseType::template getValue<0,0,0>(), BaseType::template getValue<0,1,1>()-BaseType::template getValue<0,1,0>(), @@ -1774,7 +1774,7 @@ class DenseStencil: public BaseStencil, GridT, IsSaf : BaseType(grid, /*size=*/math::Pow3(2 * halfWidth + 1)) , mHalfWidth(halfWidth) { - assert(halfWidth>0); + OPENVDB_ASSERT(halfWidth>0); } inline const ValueType& getCenterValue() const { return mValues[(mValues.size()-1)>>1]; } diff --git a/openvdb/openvdb/math/Tuple.h b/openvdb/openvdb/math/Tuple.h index c520b0ecb1..e737ff6ec8 100644 --- a/openvdb/openvdb/math/Tuple.h +++ b/openvdb/openvdb/math/Tuple.h @@ -8,6 +8,7 @@ #define OPENVDB_MATH_TUPLE_HAS_BEEN_INCLUDED #include "Math.h" +#include #include #include #include @@ -62,7 +63,7 @@ class Tuple template ::value, bool>::type = true> T operator[](IdxT i) const { - assert(i >= IdxT(0) && i < IdxT(SIZE)); + OPENVDB_ASSERT(i >= IdxT(0) && i < IdxT(SIZE)); return mm[i]; } @@ -71,7 +72,7 @@ class Tuple template ::value, bool>::type = true> T& operator[](IdxT i) { - assert(i >= IdxT(0) && i < IdxT(SIZE)); + OPENVDB_ASSERT(i >= IdxT(0) && i < IdxT(SIZE)); return mm[i]; } diff --git a/openvdb/openvdb/math/Vec3.h b/openvdb/openvdb/math/Vec3.h index aaeecee675..2ea5d32838 100644 --- a/openvdb/openvdb/math/Vec3.h +++ b/openvdb/openvdb/math/Vec3.h @@ -5,6 +5,7 @@ #define OPENVDB_MATH_VEC3_HAS_BEEN_INCLUDED #include +#include #include "Math.h" #include "Tuple.h" #include @@ -228,8 +229,8 @@ class Vec3: public Tuple<3, T> /// this = v1 cross v2, v1 and v2 must be distinct objects than "this" const Vec3& cross(const Vec3 &v1, const Vec3 &v2) { - // assert(this!=&v1); - // assert(this!=&v2); + OPENVDB_ASSERT(this!=&v1); + OPENVDB_ASSERT(this!=&v2); this->mm[0] = v1.mm[1]*v2.mm[2] - v1.mm[2]*v2.mm[1]; this->mm[1] = v1.mm[2]*v2.mm[0] - v1.mm[0]*v2.mm[2]; this->mm[2] = v1.mm[0]*v2.mm[1] - v1.mm[1]*v2.mm[0]; diff --git a/openvdb/openvdb/points/AttributeArray.h b/openvdb/openvdb/points/AttributeArray.h index d767801fb0..25ebbe8079 100644 --- a/openvdb/openvdb/points/AttributeArray.h +++ b/openvdb/openvdb/points/AttributeArray.h @@ -14,6 +14,7 @@ #include #include #include +#include #include // MappedFile #include // COMPRESS_BLOSC @@ -774,8 +775,8 @@ class TypedAttributeArray final: public AttributeArray AccessorBasePtr getAccessor() const override; /// Return the raw data buffer - inline StorageType* data() { assert(validData()); return mData.get(); } - inline const StorageType* data() const { assert(validData()); return mData.get(); } + inline StorageType* data() { OPENVDB_ASSERT(validData()); return mData.get(); } + inline const StorageType* data() const { OPENVDB_ASSERT(validData()); return 
mData.get(); } /// Verify that data is not out-of-core or in a partially-read state inline bool validData() const { return !(isOutOfCore() || (flags() & PARTIALREAD)); } @@ -1033,17 +1034,17 @@ void AttributeArray::doCopyValues(const AttributeArray& sourceArray, const IterT bool rangeChecking/*=true*/) { // ensure both arrays have float-float or integer-integer value types - assert(sourceArray.valueTypeIsFloatingPoint() == this->valueTypeIsFloatingPoint()); + OPENVDB_ASSERT(sourceArray.valueTypeIsFloatingPoint() == this->valueTypeIsFloatingPoint()); // ensure both arrays have been loaded from disk (if delay-loaded) - assert(sourceArray.isDataLoaded() && this->isDataLoaded()); + OPENVDB_ASSERT(sourceArray.isDataLoaded() && this->isDataLoaded()); // ensure storage size * stride matches on both arrays - assert(this->storageTypeSize()*this->stride() == + OPENVDB_ASSERT(this->storageTypeSize()*this->stride() == sourceArray.storageTypeSize()*sourceArray.stride()); const size_t bytes(sourceArray.storageTypeSize()*sourceArray.stride()); const char* const sourceBuffer = sourceArray.dataAsByteArray(); char* const targetBuffer = this->dataAsByteArray(); - assert(sourceBuffer && targetBuffer); + OPENVDB_ASSERT(sourceBuffer && targetBuffer); if (rangeChecking && this->isUniform()) { OPENVDB_THROW(IndexError, "Cannot copy array data as target array is uniform."); @@ -1069,9 +1070,9 @@ void AttributeArray::doCopyValues(const AttributeArray& sourceArray, const IterT } } else { // range-checking asserts - assert(sourceIndex < sourceArray.dataSize()); - assert(targetIndex < this->dataSize()); - if (this->isUniform()) assert(targetIndex == Index(0)); + OPENVDB_ASSERT(sourceIndex < sourceArray.dataSize()); + OPENVDB_ASSERT(targetIndex < this->dataSize()); + if (this->isUniform()) OPENVDB_ASSERT(targetIndex == Index(0)); } const size_t targetOffset(targetIndex * bytes); @@ -1297,13 +1298,13 @@ template void TypedAttributeArray::allocate() { - assert(!mData); + OPENVDB_ASSERT(!mData); if (mIsUniform) { mData.reset(new StorageType[1]); } else { const size_t size(this->dataSize()); - assert(size > 0); + OPENVDB_ASSERT(size > 0); mData.reset(new StorageType[size]); } } @@ -1398,7 +1399,7 @@ template typename TypedAttributeArray::ValueType TypedAttributeArray::getUnsafe(Index n) const { - assert(n < this->dataSize()); + OPENVDB_ASSERT(n < this->dataSize()); ValueType val; Codec::decode(/*in=*/this->data()[mIsUniform ? 
0 : n], /*out=*/val); @@ -1447,9 +1448,9 @@ template void TypedAttributeArray::setUnsafe(Index n, const ValueType& val) { - assert(n < this->dataSize()); - assert(!this->isOutOfCore()); - assert(!this->isUniform()); + OPENVDB_ASSERT(n < this->dataSize()); + OPENVDB_ASSERT(!this->isOutOfCore()); + OPENVDB_ASSERT(!this->isUniform()); // this unsafe method assumes the data is not uniform, however if it is, this redirects the index // to zero, which is marginally less efficient but ensures not writing to an illegal address @@ -1754,7 +1755,7 @@ TypedAttributeArray::readBuffers(std::istream& is) uint8_t bloscCompressed(0); if (!mIsUniform) is.read(reinterpret_cast(&bloscCompressed), sizeof(uint8_t)); - assert(mFlags & PARTIALREAD); + OPENVDB_ASSERT(mFlags & PARTIALREAD); std::unique_ptr buffer(new char[mCompressedBytes]); is.read(buffer.get(), mCompressedBytes); mCompressedBytes = 0; @@ -1798,12 +1799,12 @@ TypedAttributeArray::readPagedBuffers(compression::PagedInpu size_t compressedBytes(mCompressedBytes); mCompressedBytes = 0; // if not set to zero, mPageHandle will attempt to destroy invalid memory mFlags = static_cast(mFlags & ~PARTIALREAD); // mark data read as having completed - assert(!mPageHandle); + OPENVDB_ASSERT(!mPageHandle); mPageHandle = is.createHandle(compressedBytes); return; } - assert(mPageHandle); + OPENVDB_ASSERT(mPageHandle); tbb::spin_mutex::scoped_lock lock(mMutex); @@ -1987,8 +1988,8 @@ TypedAttributeArray::doLoadUnsafe(const bool /*compression*/ auto* self = const_cast*>(this); - assert(self->mPageHandle); - assert(!(self->mFlags & PARTIALREAD)); + OPENVDB_ASSERT(self->mPageHandle); + OPENVDB_ASSERT(!(self->mFlags & PARTIALREAD)); std::unique_ptr buffer = self->mPageHandle->read(); @@ -2129,7 +2130,7 @@ AttributeHandle::AttributeHandle(const AttributeArray& arr // bind getter and setter methods AttributeArray::AccessorBasePtr accessor = mArray->getAccessor(); - assert(accessor); + OPENVDB_ASSERT(accessor); AttributeArray::Accessor* typedAccessor = static_cast*>(accessor.get()); @@ -2169,7 +2170,7 @@ AttributeHandle::compatibleType() const template const AttributeArray& AttributeHandle::array() const { - assert(mArray); + OPENVDB_ASSERT(mArray); return *mArray; } @@ -2177,7 +2178,7 @@ template Index AttributeHandle::index(Index n, Index m) const { Index index = n * mStrideOrTotalSize + m; - assert(index < (mSize * mStrideOrTotalSize)); + OPENVDB_ASSERT(index < (mSize * mStrideOrTotalSize)); return index; } @@ -2303,7 +2304,7 @@ AttributeWriteHandle::set(Index index, const ValueType& va template AttributeArray& AttributeWriteHandle::array() { - assert(this->mArray); + OPENVDB_ASSERT(this->mArray); return *const_cast(this->mArray); } diff --git a/openvdb/openvdb/points/AttributeArrayString.cc b/openvdb/openvdb/points/AttributeArrayString.cc index 077cbb661f..7a5025a4e1 100644 --- a/openvdb/openvdb/points/AttributeArrayString.cc +++ b/openvdb/openvdb/points/AttributeArrayString.cc @@ -7,6 +7,7 @@ #include #include +#include #include @@ -213,7 +214,7 @@ void StringMetaInserter::resetCache() for (const Index id : stringIndices) { if (key + size != id) { - assert(size > 0); + OPENVDB_ASSERT(size > 0); mIdBlocks.emplace_back(key, size); size = 0; key = id; diff --git a/openvdb/openvdb/points/AttributeGroup.cc b/openvdb/openvdb/points/AttributeGroup.cc index f67f966eb4..c9dff68aab 100644 --- a/openvdb/openvdb/points/AttributeGroup.cc +++ b/openvdb/openvdb/points/AttributeGroup.cc @@ -4,6 +4,7 @@ /// @file points/AttributeGroup.cc #include "AttributeGroup.h" +#include namespace 
openvdb { @@ -21,7 +22,7 @@ GroupHandle::GroupHandle(const GroupAttributeArray& array, const GroupType& offs : mArray(array) , mBitMask(static_cast(1 << offset)) { - assert(isGroup(mArray)); + OPENVDB_ASSERT(isGroup(mArray)); // load data if delay-loaded @@ -34,7 +35,7 @@ GroupHandle::GroupHandle(const GroupAttributeArray& array, const GroupType& bitM : mArray(array) , mBitMask(bitMask) { - assert(isGroup(mArray)); + OPENVDB_ASSERT(isGroup(mArray)); // load data if delay-loaded @@ -62,7 +63,7 @@ bool GroupHandle::getUnsafe(Index n) const GroupWriteHandle::GroupWriteHandle(GroupAttributeArray& array, const GroupType& offset) : GroupHandle(array, offset) { - assert(isGroup(mArray)); + OPENVDB_ASSERT(isGroup(mArray)); } diff --git a/openvdb/openvdb/points/AttributeGroup.h b/openvdb/openvdb/points/AttributeGroup.h index 1af9795e45..449128a592 100644 --- a/openvdb/openvdb/points/AttributeGroup.h +++ b/openvdb/openvdb/points/AttributeGroup.h @@ -12,6 +12,7 @@ #include "AttributeArray.h" #include "AttributeSet.h" +#include #include namespace openvdb { @@ -153,7 +154,7 @@ class GroupFilter template bool valid(const IterT& iter) const { - assert(mHandle); + OPENVDB_ASSERT(mHandle); return mHandle->getUnsafe(*iter); } diff --git a/openvdb/openvdb/points/AttributeSet.cc b/openvdb/openvdb/points/AttributeSet.cc index 1d17df8cb1..a1d58d2162 100644 --- a/openvdb/openvdb/points/AttributeSet.cc +++ b/openvdb/openvdb/points/AttributeSet.cc @@ -5,6 +5,7 @@ #include "AttributeSet.h" #include "AttributeGroup.h" +#include #include // std::equal #include @@ -170,8 +171,8 @@ AttributeSet::replace(const std::string& name, const AttributeArray::Ptr& attr) size_t AttributeSet::replace(size_t pos, const AttributeArray::Ptr& attr) { - assert(pos != INVALID_POS); - assert(pos < mAttrs.size()); + OPENVDB_ASSERT(pos != INVALID_POS); + OPENVDB_ASSERT(pos < mAttrs.size()); if (attr->type() != mDescr->type(pos)) { return INVALID_POS; @@ -210,8 +211,8 @@ AttributeSet::get(const std::string& name) const AttributeArray* AttributeSet::getConst(size_t pos) const { - assert(pos != INVALID_POS); - assert(pos < mAttrs.size()); + OPENVDB_ASSERT(pos != INVALID_POS); + OPENVDB_ASSERT(pos < mAttrs.size()); return mAttrs[pos].get(); } @@ -219,8 +220,8 @@ AttributeSet::getConst(size_t pos) const const AttributeArray* AttributeSet::get(size_t pos) const { - assert(pos != INVALID_POS); - assert(pos < mAttrs.size()); + OPENVDB_ASSERT(pos != INVALID_POS); + OPENVDB_ASSERT(pos < mAttrs.size()); return this->getConst(pos); } @@ -278,8 +279,8 @@ AttributeSet::groupAttributeIndices() const bool AttributeSet::isShared(size_t pos) const { - assert(pos != INVALID_POS); - assert(pos < mAttrs.size()); + OPENVDB_ASSERT(pos != INVALID_POS); + OPENVDB_ASSERT(pos < mAttrs.size()); // Warning: In multithreaded environment, the value returned by use_count is approximate. return mAttrs[pos].use_count() != 1; } @@ -288,8 +289,8 @@ AttributeSet::isShared(size_t pos) const void AttributeSet::makeUnique(size_t pos) { - assert(pos != INVALID_POS); - assert(pos < mAttrs.size()); + OPENVDB_ASSERT(pos != INVALID_POS); + OPENVDB_ASSERT(pos < mAttrs.size()); // Warning: In multithreaded environment, the value returned by use_count is approximate. 
if (mAttrs[pos].use_count() != 1) { mAttrs[pos] = mAttrs[pos]->copy(); @@ -327,7 +328,7 @@ AttributeSet::appendAttribute( const Descriptor& expected, DescriptorPtr& repla OPENVDB_THROW(LookupError, "Cannot append attributes as descriptors do not match.") } - assert(replacement->size() >= mDescr->size()); + OPENVDB_ASSERT(replacement->size() >= mDescr->size()); const size_t offset = mDescr->size(); @@ -369,10 +370,10 @@ AttributeSet::removeAttribute(const size_t pos) { if (pos >= mAttrs.size()) return AttributeArray::Ptr(); - assert(mAttrs[pos]); + OPENVDB_ASSERT(mAttrs[pos]); AttributeArray::Ptr array; std::swap(array, mAttrs[pos]); - assert(array); + OPENVDB_ASSERT(array); // safely drop the attribute and update the descriptor std::vector toDrop{pos}; @@ -387,7 +388,7 @@ AttributeSet::removeAttributeUnsafe(const size_t pos) { if (pos >= mAttrs.size()) return AttributeArray::Ptr(); - assert(mAttrs[pos]); + OPENVDB_ASSERT(mAttrs[pos]); AttributeArray::Ptr array; std::swap(array, mAttrs[pos]); @@ -752,8 +753,8 @@ AttributeSet::Descriptor::type(size_t pos) const { // assert that pos is valid and in-range - assert(pos != AttributeSet::INVALID_POS); - assert(pos < mTypes.size()); + OPENVDB_ASSERT(pos != AttributeSet::INVALID_POS); + OPENVDB_ASSERT(pos < mTypes.size()); return mTypes[pos]; } @@ -851,7 +852,7 @@ AttributeSet::Descriptor::insert(const std::string& name, const NamePair& typeNa size_t pos = INVALID_POS; auto it = mNameMap.find(name); if (it != mNameMap.end()) { - assert(it->second < mTypes.size()); + OPENVDB_ASSERT(it->second < mTypes.size()); if (mTypes[it->second] != typeName) { OPENVDB_THROW(KeyError, "Cannot insert into a Descriptor with a duplicate name, but different type.") diff --git a/openvdb/openvdb/points/AttributeSet.h b/openvdb/openvdb/points/AttributeSet.h index 50a9a56a9c..1398f58728 100644 --- a/openvdb/openvdb/points/AttributeSet.h +++ b/openvdb/openvdb/points/AttributeSet.h @@ -15,6 +15,7 @@ #include #include +#include #include #include diff --git a/openvdb/openvdb/points/IndexFilter.h b/openvdb/openvdb/points/IndexFilter.h index 996a19617e..a9bbf2954f 100644 --- a/openvdb/openvdb/points/IndexFilter.h +++ b/openvdb/openvdb/points/IndexFilter.h @@ -42,6 +42,7 @@ #include #include +#include #include "IndexIterator.h" #include "AttributeArray.h" @@ -198,7 +199,7 @@ class MultiGroupFilter template bool valid(const IterT& iter) const { - assert(mInitialized); + OPENVDB_ASSERT(mInitialized); // accept no include filters as valid bool includeValid = mIncludeHandles.empty(); for (const GroupHandle& handle : mIncludeHandles) { @@ -348,13 +349,13 @@ class AttributeHashFilter template void reset(const LeafT& leaf) { - assert(leaf.hasAttribute(mIndex)); + OPENVDB_ASSERT(leaf.hasAttribute(mIndex)); mIdHandle.reset(new Handle(leaf.constAttributeArray(mIndex))); } template bool valid(const IterT& iter) const { - assert(mIdHandle); + OPENVDB_ASSERT(mIdHandle); const IntType id = mIdHandle->get(*iter); const unsigned int seed = mSeed + static_cast(id); RandGenT generator(seed); @@ -410,8 +411,8 @@ class LevelSetFilter template bool valid(const IterT& iter) const { - assert(mPositionHandle); - assert(iter); + OPENVDB_ASSERT(mPositionHandle); + OPENVDB_ASSERT(iter); const openvdb::Coord ijk = iter.getCoord(); const openvdb::Vec3f voxelIndexSpace = ijk.asVec3d(); @@ -477,7 +478,7 @@ class BBoxFilter template bool valid(const IterT& iter) const { - assert(mPositionHandle); + OPENVDB_ASSERT(mPositionHandle); const openvdb::Coord ijk = iter.getCoord(); const openvdb::Vec3f 
voxelIndexSpace = ijk.asVec3d(); diff --git a/openvdb/openvdb/points/IndexIterator.h b/openvdb/openvdb/points/IndexIterator.h index 99ca1dd9e5..9a8e300d57 100644 --- a/openvdb/openvdb/points/IndexIterator.h +++ b/openvdb/openvdb/points/IndexIterator.h @@ -12,6 +12,7 @@ #include #include +#include namespace openvdb { OPENVDB_USE_VERSION_NAMESPACE @@ -147,7 +148,7 @@ class IndexIter : mIter(iter), mParent(&mIter.parent()) { if (mIter) { - assert(mParent); + OPENVDB_ASSERT(mParent); Index32 start = (mIter.offset() > 0 ? Index32(mParent->getValue(mIter.offset() - 1)) : Index32(0)); this->reset(start, *mIter); @@ -157,7 +158,7 @@ class IndexIter ValueIndexIter(const ValueIndexIter& other) : mEnd(other.mEnd), mItem(other.mItem), mIter(other.mIter), mParent(other.mParent) { - assert(mParent); + OPENVDB_ASSERT(mParent); } ValueIndexIter& operator=(const ValueIndexIter&) = default; @@ -169,8 +170,8 @@ class IndexIter } /// @brief Returns the item to which this iterator is currently pointing. - inline Index32 operator*() { assert(mIter); return mItem; } - inline Index32 operator*() const { assert(mIter); return mItem; } + inline Index32 operator*() { OPENVDB_ASSERT(mIter); return mItem; } + inline Index32 operator*() const { OPENVDB_ASSERT(mIter); return mItem; } /// @brief Return @c true if this iterator is not yet exhausted. inline operator bool() const { return mIter; } @@ -180,7 +181,7 @@ class IndexIter inline ValueIndexIter& operator++() { ++mItem; while (mItem >= mEnd && mIter.next()) { - assert(mParent); + OPENVDB_ASSERT(mParent); this->reset(mParent->getValue(mIter.offset() - 1), *mIter); } return *this; @@ -191,12 +192,12 @@ class IndexIter inline bool increment() { this->next(); return this->test(); } /// Return the coordinates of the item to which the value iterator is pointing. - inline Coord getCoord() const { assert(mIter); return mIter.getCoord(); } + inline Coord getCoord() const { OPENVDB_ASSERT(mIter); return mIter.getCoord(); } /// Return in @a xyz the coordinates of the item to which the value iterator is pointing. - inline void getCoord(Coord& xyz) const { assert(mIter); xyz = mIter.getCoord(); } + inline void getCoord(Coord& xyz) const { OPENVDB_ASSERT(mIter); xyz = mIter.getCoord(); } /// @brief Return @c true if this iterator is pointing to an active value. - inline bool isValueOn() const { assert(mIter); return mIter.isValueOn(); } + inline bool isValueOn() const { OPENVDB_ASSERT(mIter); return mIter.isValueOn(); } /// Return the const value iterator inline const IteratorT& valueIter() const { return mIter; } @@ -257,8 +258,8 @@ class IndexIter } /// @brief Returns the item to which this iterator is currently pointing. - Index32 operator*() { assert(mIterator); return *mIterator; } - Index32 operator*() const { assert(mIterator); return *mIterator; } + Index32 operator*() { OPENVDB_ASSERT(mIterator); return *mIterator; } + Index32 operator*() const { OPENVDB_ASSERT(mIterator); return *mIterator; } /// @brief Return @c true if this iterator is not yet exhausted. operator bool() const { return mIterator.test(); } @@ -290,12 +291,12 @@ class IndexIter inline const FilterT& filter() const { return mFilter; } /// Return the coordinates of the item to which the value iterator is pointing. - inline Coord getCoord() const { assert(mIterator); return mIterator.getCoord(); } + inline Coord getCoord() const { OPENVDB_ASSERT(mIterator); return mIterator.getCoord(); } /// Return in @a xyz the coordinates of the item to which the value iterator is pointing. 
- inline void getCoord(Coord& xyz) const { assert(mIterator); xyz = mIterator.getCoord(); } + inline void getCoord(Coord& xyz) const { OPENVDB_ASSERT(mIterator); xyz = mIterator.getCoord(); } /// @brief Return @c true if the value iterator is pointing to an active value. - inline bool isValueOn() const { assert(mIterator); return mIterator.valueIter().isValueOn(); } + inline bool isValueOn() const { OPENVDB_ASSERT(mIterator); return mIterator.valueIter().isValueOn(); } /// @brief Equality operators bool operator==(const IndexIter& other) const { return mIterator == other.mIterator; } diff --git a/openvdb/openvdb/points/PointAttribute.h b/openvdb/openvdb/points/PointAttribute.h index 8b54930c47..ae989703a9 100644 --- a/openvdb/openvdb/points/PointAttribute.h +++ b/openvdb/openvdb/points/PointAttribute.h @@ -11,6 +11,7 @@ #define OPENVDB_POINTS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED #include +#include #include "AttributeArrayString.h" #include "AttributeSet.h" diff --git a/openvdb/openvdb/points/PointConversion.h b/openvdb/openvdb/points/PointConversion.h index f73ddc0554..bc2dce3d97 100644 --- a/openvdb/openvdb/points/PointConversion.h +++ b/openvdb/openvdb/points/PointConversion.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "AttributeArrayString.h" #include "AttributeSet.h" diff --git a/openvdb/openvdb/points/PointDataGrid.h b/openvdb/openvdb/points/PointDataGrid.h index bd560acf39..5e1a7ac21b 100644 --- a/openvdb/openvdb/points/PointDataGrid.h +++ b/openvdb/openvdb/points/PointDataGrid.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "AttributeArray.h" #include "AttributeArrayString.h" #include "AttributeGroup.h" @@ -519,7 +520,7 @@ class PointDataLeafNode : public tree::LeafNode, io::MultiPass { // to the point-array offsets. void assertNonmodifiable() { - assert(false && "Cannot modify voxel values in a PointDataTree."); + OPENVDB_ASSERT(false && "Cannot modify voxel values in a PointDataTree."); } // some methods silently ignore attempts to modify the @@ -960,7 +961,7 @@ inline GroupHandle PointDataLeafNode::groupHandle(const AttributeSet::Descriptor::GroupIndex& index) const { const AttributeArray& array = this->attributeArray(index.first); - assert(isGroup(array)); + OPENVDB_ASSERT(isGroup(array)); const GroupAttributeArray& groupArray = GroupAttributeArray::cast(array); @@ -980,7 +981,7 @@ inline GroupWriteHandle PointDataLeafNode::groupWriteHandle(const AttributeSet::Descriptor::GroupIndex& index) { AttributeArray& array = this->attributeArray(index.first); - assert(isGroup(array)); + OPENVDB_ASSERT(isGroup(array)); GroupAttributeArray& groupArray = GroupAttributeArray::cast(array); @@ -1025,7 +1026,7 @@ inline ValueVoxelCIter PointDataLeafNode::beginValueVoxel(const Coord& ijk) const { const Index index = LeafNodeType::coordToOffset(ijk); - assert(index < BaseLeaf::SIZE); + OPENVDB_ASSERT(index < BaseLeaf::SIZE); const ValueType end = this->getValue(index); const ValueType start = (index == 0) ? 
ValueType(0) : this->getValue(index - 1); return ValueVoxelCIter(start, end); @@ -1215,7 +1216,7 @@ PointDataLeafNode::readBuffers(std::istream& is, const CoordBBox& /* { std::string descriptorKey("descriptorPtr"); auto itDescriptor = auxData.find(descriptorKey); - assert(itDescriptor != auxData.end()); + OPENVDB_ASSERT(itDescriptor != auxData.end()); const Descriptor::Ptr descriptor = std::any_cast(itDescriptor->second); return descriptor; } @@ -1369,14 +1370,14 @@ PointDataLeafNode::writeBuffers(std::ostream& os, bool toHalf) const if (itMatching == auxData.end()) { // if matching bool is not found, insert "true" and the descriptor (const_cast(auxData))[matchingKey] = true; - assert(itDescriptor == auxData.end()); + OPENVDB_ASSERT(itDescriptor == auxData.end()); (const_cast(auxData))[descriptorKey] = descriptor; } else { // if matching bool is found and is false, early exit (a previous descriptor did not match) bool matching = std::any_cast(itMatching->second); if (!matching) return; - assert(itDescriptor != auxData.end()); + OPENVDB_ASSERT(itDescriptor != auxData.end()); // if matching bool is true, check whether the existing descriptor matches the current one and set // matching bool to false if not const Descriptor::Ptr existingDescriptor = std::any_cast(itDescriptor->second); @@ -1638,7 +1639,7 @@ prefetch(PointDataTreeT& tree, bool position, bool otherAttributes) if (position && positionIndex != AttributeSet::INVALID_POS) { for (leaf = tree.cbeginLeaf(); leaf; ++leaf) { - assert(leaf->hasAttribute(positionIndex)); + OPENVDB_ASSERT(leaf->hasAttribute(positionIndex)); leaf->constAttributeArray(positionIndex).loadData(); } } @@ -1650,7 +1651,7 @@ prefetch(PointDataTreeT& tree, bool position, bool otherAttributes) for (size_t attributeIndex = 0; attributeIndex < attributes; attributeIndex++) { if (attributeIndex == positionIndex) continue; for (leaf = tree.cbeginLeaf(); leaf; ++leaf) { - assert(leaf->hasAttribute(attributeIndex)); + OPENVDB_ASSERT(leaf->hasAttribute(attributeIndex)); leaf->constAttributeArray(attributeIndex).loadData(); } } diff --git a/openvdb/openvdb/points/PointDelete.h b/openvdb/openvdb/points/PointDelete.h index c7504efb9e..8c9ee5a36c 100644 --- a/openvdb/openvdb/points/PointDelete.h +++ b/openvdb/openvdb/points/PointDelete.h @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/openvdb/openvdb/points/PointMask.h b/openvdb/openvdb/points/PointMask.h index cb01992cfc..cbd3dc6a89 100644 --- a/openvdb/openvdb/points/PointMask.h +++ b/openvdb/openvdb/points/PointMask.h @@ -12,6 +12,7 @@ #include #include // valxform::SumOp +#include #include "PointDataGrid.h" #include "IndexFilter.h" diff --git a/openvdb/openvdb/points/PointMove.h b/openvdb/openvdb/points/PointMove.h index ce8ebf20bd..721dd13501 100644 --- a/openvdb/openvdb/points/PointMove.h +++ b/openvdb/openvdb/points/PointMove.h @@ -33,6 +33,7 @@ #define OPENVDB_POINTS_POINT_MOVE_HAS_BEEN_INCLUDED #include +#include #include "PointDataGrid.h" #include "PointMask.h" diff --git a/openvdb/openvdb/points/PointRasterizeFrustum.h b/openvdb/openvdb/points/PointRasterizeFrustum.h index 9449694113..fea31761f6 100644 --- a/openvdb/openvdb/points/PointRasterizeFrustum.h +++ b/openvdb/openvdb/points/PointRasterizeFrustum.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include // for tools::resampleToMatch() #include diff --git a/openvdb/openvdb/points/PointRasterizeSDF.h b/openvdb/openvdb/points/PointRasterizeSDF.h index abf506f538..c7b60b598a 100644 --- 
a/openvdb/openvdb/points/PointRasterizeSDF.h +++ b/openvdb/openvdb/points/PointRasterizeSDF.h @@ -50,6 +50,7 @@ #include #include #include +#include #include diff --git a/openvdb/openvdb/points/PointRasterizeTrilinear.h b/openvdb/openvdb/points/PointRasterizeTrilinear.h index 5279c53b57..2f347bfdba 100644 --- a/openvdb/openvdb/points/PointRasterizeTrilinear.h +++ b/openvdb/openvdb/points/PointRasterizeTrilinear.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "PointDataGrid.h" #include "PointMask.h" diff --git a/openvdb/openvdb/points/PointReplicate.h b/openvdb/openvdb/points/PointReplicate.h index f16810ad3d..d14520e0b3 100644 --- a/openvdb/openvdb/points/PointReplicate.h +++ b/openvdb/openvdb/points/PointReplicate.h @@ -12,6 +12,7 @@ #include #include +#include namespace openvdb { OPENVDB_USE_VERSION_NAMESPACE diff --git a/openvdb/openvdb/points/PointSample.h b/openvdb/openvdb/points/PointSample.h index 1ff49b611f..c2c883e692 100644 --- a/openvdb/openvdb/points/PointSample.h +++ b/openvdb/openvdb/points/PointSample.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "PointDataGrid.h" #include "PointAttribute.h" diff --git a/openvdb/openvdb/points/PointScatter.h b/openvdb/openvdb/points/PointScatter.h index e089da513c..cf1c547ae8 100644 --- a/openvdb/openvdb/points/PointScatter.h +++ b/openvdb/openvdb/points/PointScatter.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "AttributeArray.h" #include "PointCount.h" diff --git a/openvdb/openvdb/points/PointStatistics.h b/openvdb/openvdb/points/PointStatistics.h index 74e310576d..9fcd43696a 100644 --- a/openvdb/openvdb/points/PointStatistics.h +++ b/openvdb/openvdb/points/PointStatistics.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/openvdb/openvdb/points/PointTransfer.h b/openvdb/openvdb/points/PointTransfer.h index d4496bbbc2..8f9bfc3806 100644 --- a/openvdb/openvdb/points/PointTransfer.h +++ b/openvdb/openvdb/points/PointTransfer.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -285,7 +286,7 @@ struct VolumeTransfer : mTree(tree) , mBuffer(nullptr) , mMask(nullptr) { - assert(tree); + OPENVDB_ASSERT(tree); } VolumeTransfer(TreeType& tree) @@ -300,7 +301,7 @@ struct VolumeTransfer inline void initialize(const Coord& origin, const size_t, const CoordBBox&) { - assert(mTree); + OPENVDB_ASSERT(mTree); if (auto leaf = mTree->probeLeaf(origin)) { mBuffer = leaf->buffer().data(); mMask = &(leaf->getValueMask()); @@ -367,7 +368,7 @@ VolumeTransfer::VolumeTransfer(TreeTypes*... 
trees) static_assert(std::is_base_of::value, "One or more template arguments to VolumeTransfer " "are not a valid openvdb::Tree type."); - assert(tree); + OPENVDB_ASSERT(tree); }, std::make_integer_sequence()); mBuffers.fill(nullptr); @@ -379,7 +380,7 @@ inline void VolumeTransfer::initialize(const Coord& origin, const { transfer_internal::foreach(mTreeArray, [&](auto&& tree, const size_t i) { - assert(tree); + OPENVDB_ASSERT(tree); if (auto leaf = tree->probeLeaf(origin)) { mBuffers[i] = static_cast(leaf->buffer().data()); mMasks[i] = &(leaf->getValueMask()); @@ -444,7 +445,7 @@ struct RasterizePoints // Use evalActiveBoundingBox over getNodeBoundingBox() // to get a better approximation leaf.evalActiveBoundingBox(bounds); - assert(!bounds.empty()); + OPENVDB_ASSERT(!bounds.empty()); } mTransfer.initialize(origin, idx, bounds); @@ -483,7 +484,7 @@ struct RasterizePoints const Index ij = i + ((ijk.y() & (DIM-1u)) << LOG2DIM); for (ijk.z() = pmin.z(); ijk.z() <= pmax.z(); ++ijk.z()) { // voxel should be in this points leaf - assert((ijk & ~(DIM-1u)) == leafOrigin); + OPENVDB_ASSERT((ijk & ~(DIM-1u)) == leafOrigin); const Index index = ij + /*k*/(ijk.z() & (DIM-1u)); const Index end = pointLeaf->getValue(index); Index id = (index == 0) ? 0 : Index(pointLeaf->getValue(index - 1)); diff --git a/openvdb/openvdb/points/StreamCompression.cc b/openvdb/openvdb/points/StreamCompression.cc index 9efedc8e42..ad4f8990d6 100644 --- a/openvdb/openvdb/points/StreamCompression.cc +++ b/openvdb/openvdb/points/StreamCompression.cc @@ -5,6 +5,7 @@ #include "StreamCompression.h" #include +#include #include #ifdef OPENVDB_USE_BLOSC #include @@ -288,7 +289,7 @@ Page::load() const long Page::uncompressedBytes() const { - assert(mInfo); + OPENVDB_ASSERT(mInfo); return mInfo->uncompressedBytes; } @@ -307,7 +308,7 @@ Page::buffer(const int index) const void Page::readHeader(std::istream& is) { - assert(mInfo); + OPENVDB_ASSERT(mInfo); // read the (compressed) size of the page int compressedSize; @@ -318,8 +319,8 @@ Page::readHeader(std::istream& is) if (compressedSize > 0) is.read(reinterpret_cast(&uncompressedSize), sizeof(int)); else uncompressedSize = -compressedSize; - assert(compressedSize != 0); - assert(uncompressedSize != 0); + OPENVDB_ASSERT(compressedSize != 0); + OPENVDB_ASSERT(uncompressedSize != 0); mInfo->compressedBytes = compressedSize; mInfo->uncompressedBytes = uncompressedSize; @@ -331,7 +332,7 @@ Page::readBuffers(std::istream&is, bool delayed) { (void) delayed; - assert(mInfo); + OPENVDB_ASSERT(mInfo); bool isCompressed = mInfo->compressedBytes > 0; @@ -340,7 +341,7 @@ Page::readBuffers(std::istream&is, bool delayed) if (delayed && mappedFile) { SharedPtr meta = io::getStreamMetadataPtr(is); - assert(meta); + OPENVDB_ASSERT(meta); std::streamoff filepos = is.tellg(); @@ -352,7 +353,7 @@ Page::readBuffers(std::istream&is, bool delayed) mInfo->meta = meta; mInfo->filepos = filepos; - assert(mInfo->mappedFile); + OPENVDB_ASSERT(mInfo->mappedFile); } else { #endif @@ -418,19 +419,19 @@ Page::doLoad() const tbb::spin_mutex::scoped_lock lock(self->mMutex); if (!this->isOutOfCore()) return; - assert(self->mInfo); + OPENVDB_ASSERT(self->mInfo); int compressedBytes = static_cast(self->mInfo->compressedBytes); bool compressed = compressedBytes > 0; if (!compressed) compressedBytes = -compressedBytes; - assert(compressedBytes); + OPENVDB_ASSERT(compressedBytes); std::unique_ptr temp(new char[compressedBytes]); - assert(self->mInfo->mappedFile); + OPENVDB_ASSERT(self->mInfo->mappedFile); SharedPtr buf = 
self->mInfo->mappedFile->createBuffer(); - assert(buf); + OPENVDB_ASSERT(buf); std::istream is(buf.get()); io::setStreamMetadataPtr(is, self->mInfo->meta, /*transfer=*/true); @@ -460,7 +461,7 @@ PageHandle::PageHandle( const Page::Ptr& page, const int index, const int size) Page& PageHandle::page() { - assert(mPage); + OPENVDB_ASSERT(mPage); return *mPage; } @@ -468,8 +469,8 @@ PageHandle::page() std::unique_ptr PageHandle::read() { - assert(mIndex >= 0); - assert(mSize > 0); + OPENVDB_ASSERT(mIndex >= 0); + OPENVDB_ASSERT(mSize > 0); std::unique_ptr buffer(new char[mSize]); std::memcpy(buffer.get(), mPage->buffer(mIndex), mSize); return buffer; @@ -488,7 +489,7 @@ PagedInputStream::PagedInputStream(std::istream& is) PageHandle::Ptr PagedInputStream::createHandle(std::streamsize n) { - assert(mByteIndex <= mUncompressedBytes); + OPENVDB_ASSERT(mByteIndex <= mUncompressedBytes); if (mByteIndex == mUncompressedBytes) { @@ -510,7 +511,7 @@ PagedInputStream::createHandle(std::streamsize n) void PagedInputStream::read(PageHandle::Ptr& pageHandle, std::streamsize n, bool delayed) { - assert(mByteIndex <= mUncompressedBytes); + OPENVDB_ASSERT(mByteIndex <= mUncompressedBytes); Page& page = pageHandle->page(); @@ -580,7 +581,7 @@ PagedOutputStream::compressAndWrite(const char* buffer, size_t size) { if (size == 0) return; - assert(size < std::numeric_limits::max()); + OPENVDB_ASSERT(size < std::numeric_limits::max()); this->resize(size); diff --git a/openvdb/openvdb/points/StreamCompression.h b/openvdb/openvdb/points/StreamCompression.h index f2e0d60e92..d9f77bd1c6 100644 --- a/openvdb/openvdb/points/StreamCompression.h +++ b/openvdb/openvdb/points/StreamCompression.h @@ -20,6 +20,7 @@ #define OPENVDB_TOOLS_STREAM_COMPRESSION_HAS_BEEN_INCLUDED #include +#include #include #include #include @@ -217,7 +218,7 @@ class OPENVDB_API PagedInputStream bool sizeOnly() const { return mSizeOnly; } // @brief Set and get the input stream - std::istream& getInputStream() { assert(mIs); return *mIs; } + std::istream& getInputStream() { OPENVDB_ASSERT(mIs); return *mIs; } void setInputStream(std::istream& is) { mIs = &is; } /// @brief Creates a PageHandle to access the next @param n bytes of the Page. 
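For context on the invariants the new OPENVDB_ASSERTs guard in this file: a page header stores one int whose sign distinguishes compressed from uncompressed pages, so a zero size is never legal in either field. Below is a minimal, hedged sketch of that convention; PageHeader, writePageHeader and readPageHeader are hypothetical names used for illustration, not the library's API.

#include <cassert>
#include <istream>
#include <ostream>

struct PageHeader { int compressedBytes; int uncompressedBytes; };

// Hypothetical encoder: a positive leading int means "compressed; a second
// int with the uncompressed size follows". Otherwise the negated
// uncompressed size is stored alone.
inline void writePageHeader(std::ostream& os, const PageHeader& h, bool compressed)
{
    if (compressed) {
        os.write(reinterpret_cast<const char*>(&h.compressedBytes), sizeof(int));
        os.write(reinterpret_cast<const char*>(&h.uncompressedBytes), sizeof(int));
    } else {
        const int negated = -h.uncompressedBytes;
        os.write(reinterpret_cast<const char*>(&negated), sizeof(int));
    }
}

// Hypothetical decoder mirroring the logic of Page::readHeader() above.
inline PageHeader readPageHeader(std::istream& is)
{
    PageHeader h{0, 0};
    is.read(reinterpret_cast<char*>(&h.compressedBytes), sizeof(int));
    if (h.compressedBytes > 0) {
        is.read(reinterpret_cast<char*>(&h.uncompressedBytes), sizeof(int));
    } else {
        h.uncompressedBytes = -h.compressedBytes; // sign flags "uncompressed"
    }
    // the two assertions added in the diff: a zero size is never valid
    assert(h.compressedBytes != 0);
    assert(h.uncompressedBytes != 0);
    return h;
}

Encoding the flag in the sign keeps the common (compressed) header at two ints and the uncompressed header at a single int.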
@@ -254,7 +255,7 @@ class OPENVDB_API PagedOutputStream bool sizeOnly() const { return mSizeOnly; } /// @brief Set and get the output stream - std::ostream& getOutputStream() { assert(mOs); return *mOs; } + std::ostream& getOutputStream() { OPENVDB_ASSERT(mOs); return *mOs; } void setOutputStream(std::ostream& os) { mOs = &os; } /// @brief Writes the given @param str buffer of size @param n diff --git a/openvdb/openvdb/points/impl/PointAttributeImpl.h b/openvdb/openvdb/points/impl/PointAttributeImpl.h index 6100bbf9d1..fb8d595da4 100644 --- a/openvdb/openvdb/points/impl/PointAttributeImpl.h +++ b/openvdb/openvdb/points/impl/PointAttributeImpl.h @@ -222,7 +222,7 @@ inline void collapseAttribute( PointDataTreeT& tree, tree::LeafManager leafManager(tree); leafManager.foreach( [&](typename PointDataTreeT::LeafNodeType& leaf, size_t /*idx*/) { - assert(leaf.hasAttribute(index)); + OPENVDB_ASSERT(leaf.hasAttribute(index)); AttributeArray& array = leaf.attributeArray(index); point_attribute_internal::collapseAttribute( array, descriptor, uniformValue); @@ -354,7 +354,7 @@ inline void renameAttributes( PointDataTreeT& tree, } const AttributeArray* array = attributeSet.getConst(oldName); - assert(array); + OPENVDB_ASSERT(array); if (isGroup(*array)) { OPENVDB_THROW(KeyError, "Cannot rename group attribute - " << oldName << "."); diff --git a/openvdb/openvdb/points/impl/PointConversionImpl.h b/openvdb/openvdb/points/impl/PointConversionImpl.h index f812c03257..0ac4e7aee7 100644 --- a/openvdb/openvdb/points/impl/PointConversionImpl.h +++ b/openvdb/openvdb/points/impl/PointConversionImpl.h @@ -167,7 +167,7 @@ struct ConvertPointDataGridPositionOp { for (auto leaf = range.begin(); leaf; ++leaf) { - assert(leaf.pos() < mPointOffsets.size()); + OPENVDB_ASSERT(leaf.pos() < mPointOffsets.size()); if (mInCoreOnly && leaf->buffer().isOutOfCore()) continue; @@ -255,7 +255,7 @@ struct ConvertPointDataGridAttributeOp { for (auto leaf = range.begin(); leaf; ++leaf) { - assert(leaf.pos() < mPointOffsets.size()); + OPENVDB_ASSERT(leaf.pos() < mPointOffsets.size()); if (mInCoreOnly && leaf->buffer().isOutOfCore()) continue; @@ -335,7 +335,7 @@ struct ConvertPointDataGridGroupOp { { for (auto leaf = range.begin(); leaf; ++leaf) { - assert(leaf.pos() < mPointOffsets.size()); + OPENVDB_ASSERT(leaf.pos() < mPointOffsets.size()); if (mInCoreOnly && leaf->buffer().isOutOfCore()) continue; @@ -344,7 +344,7 @@ struct ConvertPointDataGridGroupOp { if (leaf.pos() > 0) offset += mPointOffsets[leaf.pos() - 1]; const AttributeArray& array = leaf->constAttributeArray(mIndex.first); - assert(isGroup(array)); + OPENVDB_ASSERT(isGroup(array)); const GroupAttributeArray& groupArray = GroupAttributeArray::cast(array); if (mFilter.state() == index::ALL) { @@ -448,7 +448,7 @@ createPointDataGrid(const PointIndexGridT& pointIndexGrid, // retrieve position index const size_t positionIndex = descriptor->find("P"); - assert(positionIndex != AttributeSet::INVALID_POS); + OPENVDB_ASSERT(positionIndex != AttributeSet::INVALID_POS); // acquire registry lock to avoid locking when appending attributes in parallel @@ -463,7 +463,7 @@ createPointDataGrid(const PointIndexGridT& pointIndexGrid, // obtain the PointIndexLeafNode (using the origin of the current leaf) const auto* pointIndexLeaf = pointIndexTree.probeConstLeaf(leaf.origin()); - assert(pointIndexLeaf); + OPENVDB_ASSERT(pointIndexLeaf); // initialise the attribute storage diff --git a/openvdb/openvdb/points/impl/PointDeleteImpl.h b/openvdb/openvdb/points/impl/PointDeleteImpl.h index 
01f98c3dad..5b082a7f31 100644 --- a/openvdb/openvdb/points/impl/PointDeleteImpl.h +++ b/openvdb/openvdb/points/impl/PointDeleteImpl.h @@ -25,8 +25,8 @@ struct VectorWrapper VectorWrapper(const T& _data) : data(_data) { } operator bool() const { return index < data.size(); } VectorWrapper& operator++() { index++; return *this; } - Index sourceIndex() const { assert(*this); return data[index].first; } - Index targetIndex() const { assert(*this); return data[index].second; } + Index sourceIndex() const { OPENVDB_ASSERT(*this); return data[index].first; } + Index targetIndex() const { OPENVDB_ASSERT(*this); return data[index].second; } private: const T& data; diff --git a/openvdb/openvdb/points/impl/PointGroupImpl.h b/openvdb/openvdb/points/impl/PointGroupImpl.h index 885024e7da..1fb1c32472 100644 --- a/openvdb/openvdb/points/impl/PointGroupImpl.h +++ b/openvdb/openvdb/points/impl/PointGroupImpl.h @@ -255,7 +255,7 @@ inline void appendGroup(PointDataTreeT& tree, const Name& group) // ensure that there are now available groups - assert(descriptor->unusedGroups() > 0); + OPENVDB_ASSERT(descriptor->unusedGroups() > 0); // find next unused offset @@ -422,7 +422,7 @@ inline void compactGroups(PointDataTreeT& tree) const size_t totalAttributesToDrop = descriptor->unusedGroups() / descriptor->groupBits(); - assert(totalAttributesToDrop <= indices.size()); + OPENVDB_ASSERT(totalAttributesToDrop <= indices.size()); const std::vector indicesToDrop(indices.end() - totalAttributesToDrop, indices.end()); diff --git a/openvdb/openvdb/points/impl/PointMaskImpl.h b/openvdb/openvdb/points/impl/PointMaskImpl.h index b6634f9a48..474cdb815c 100644 --- a/openvdb/openvdb/points/impl/PointMaskImpl.h +++ b/openvdb/openvdb/points/impl/PointMaskImpl.h @@ -94,7 +94,7 @@ struct PointsToScalarOp // assumes matching topology const auto* const pointLeaf = mPointDataAccessor.probeConstLeaf(leaf.origin()); - assert(pointLeaf); + OPENVDB_ASSERT(pointLeaf); for (auto value = leaf.beginValueOn(); value; ++value) { const auto iter = pointLeaf->beginIndexVoxel(value.getCoord(), mFilter); @@ -173,7 +173,7 @@ struct PointsToTransformedScalarOp // increment count in target voxel auto* newLeaf = accessor.touchLeaf(ijk); - assert(newLeaf); + OPENVDB_ASSERT(newLeaf); voxelSum(*newLeaf, newLeaf->coordToOffset(ijk), ValueT(1)); } } diff --git a/openvdb/openvdb/points/impl/PointMoveImpl.h b/openvdb/openvdb/points/impl/PointMoveImpl.h index 984c901f79..4b76d8f7cf 100644 --- a/openvdb/openvdb/points/impl/PointMoveImpl.h +++ b/openvdb/openvdb/points/impl/PointMoveImpl.h @@ -169,7 +169,7 @@ struct BuildMoveMapsOp // determine target leaf node origin and offset in the target leaf vector Coord targetLeafOrigin = targetVoxel & ~(LeafT::DIM - 1); - assert(mTargetLeafMap.find(targetLeafOrigin) != mTargetLeafMap.end()); + OPENVDB_ASSERT(mTargetLeafMap.find(targetLeafOrigin) != mTargetLeafMap.end()); const LeafIndex targetLeafOffset(mTargetLeafMap.at(targetLeafOrigin)); // insert into move map based on whether point ends up in a new leaf node or not @@ -265,13 +265,13 @@ struct GlobalMovePointsOp Index sourceIndex() const { - assert(mIt); + OPENVDB_ASSERT(mIt); return std::get<2>(*mIt); } Index targetIndex() const { - assert(mIt); + OPENVDB_ASSERT(mIt); return indexOffsetFromVoxel(std::get<1>(*mIt), mLeaf, mOffsets); } @@ -419,7 +419,7 @@ struct LocalMovePointsOp // extract source array that has the same origin as the target leaf - assert(idx < mSourceIndices.size()); + OPENVDB_ASSERT(idx < mSourceIndices.size()); const Index 
sourceLeafOffset(mSourceIndices[idx]); LeafT& sourceLeaf = mSourceLeafManager.leaf(sourceLeafOffset); const auto& sourceArray = sourceLeaf.constAttributeArray(mAttributeIndex); @@ -468,7 +468,7 @@ inline void movePoints( PointDataGridT& points, using namespace point_move_internal; // this object is for future use only - assert(!objectNotInUse); + OPENVDB_ASSERT(!objectNotInUse); (void)objectNotInUse; PointDataTreeT& tree = points.tree(); @@ -742,7 +742,7 @@ template template void CachedDeformer::apply(Vec3d& position, const IndexIterT& iter) const { - assert(*iter >= 0); + OPENVDB_ASSERT(*iter >= 0); if (mLeafMap) { auto it = mLeafMap->find(*iter); @@ -750,10 +750,10 @@ void CachedDeformer::apply(Vec3d& position, const IndexIterT& iter) const position = static_cast(it->second); } else { - assert(mLeafVec); + OPENVDB_ASSERT(mLeafVec); if (mLeafVec->empty()) return; - assert(*iter < mLeafVec->size()); + OPENVDB_ASSERT(*iter < mLeafVec->size()); position = static_cast((*mLeafVec)[*iter]); } } diff --git a/openvdb/openvdb/points/impl/PointRasterizeFrustumImpl.h b/openvdb/openvdb/points/impl/PointRasterizeFrustumImpl.h index 9878751cca..9fecd22f69 100644 --- a/openvdb/openvdb/points/impl/PointRasterizeFrustumImpl.h +++ b/openvdb/openvdb/points/impl/PointRasterizeFrustumImpl.h @@ -207,7 +207,7 @@ struct RasterizeOp static void rasterVoxelSphere(const Vec3d& position, const double scale, const AttributeT& attributeScale, const float radius, util::NullInterrupter* interrupter, SphereOpT& op) { - assert(radius > 0.0f); + OPENVDB_ASSERT(radius > 0.0f); Coord ijk = Coord::round(position); int &i = ijk[0], &j = ijk[1], &k = ijk[2]; const int imin=math::Floor(position[0]-radius), imax=math::Ceil(position[0]+radius); @@ -1119,7 +1119,7 @@ void RasterCamera::simplify() bool RasterCamera::hasWeight(Index i) const { if (mWeights.empty()) return false; - assert(i < mWeights.size()); + OPENVDB_ASSERT(i < mWeights.size()); return !openvdb::math::isApproxEqual(mWeights[i], 1.0f, 1e-3f); } @@ -1128,7 +1128,7 @@ float RasterCamera::weight(Index i) const if (mWeights.empty()) { return 1.0f; } else { - assert(i < mWeights.size()); + OPENVDB_ASSERT(i < mWeights.size()); return mWeights[i]; } } @@ -1138,20 +1138,20 @@ const math::Transform& RasterCamera::transform(Index i) const if (mTransforms.size() == 1) { return mTransforms.front(); } else { - assert(i < mTransforms.size()); + OPENVDB_ASSERT(i < mTransforms.size()); return mTransforms[i]; } } const math::Transform& RasterCamera::firstTransform() const { - assert(!mTransforms.empty()); + OPENVDB_ASSERT(!mTransforms.empty()); return mTransforms.front(); } const math::Transform& RasterCamera::lastTransform() const { - assert(!mTransforms.empty()); + OPENVDB_ASSERT(!mTransforms.empty()); return mTransforms.back(); } diff --git a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h index 20f560f599..37bfef7ff0 100644 --- a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h +++ b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h @@ -74,7 +74,7 @@ struct VaryingRadius /// @brief Compute a fixed radius for a specific point inline const FixedRadius eval(const Index id) const { - assert(mRHandle); + OPENVDB_ASSERT(mRHandle); return FixedRadius(mRHandle->get(id) * mScale); } @@ -163,7 +163,7 @@ struct SignedDistanceFieldTransfer : , mDx(surface.voxelSize()[0]) , mIds(ids) , mPLeafMask(0) { - assert(cpg && ids); + OPENVDB_ASSERT(cpg && ids); } /// @brief Constructor to use when a closest point grid is in use @@ -487,7
+487,7 @@ struct AveragePositionTransfer : // k(s) = max(0,(1−s^2)^3). note that the max is unnecessary // as we early terminate above with x2y2z2 >= mMaxSearchSqIS x2y2z2 = math::Pow3(1.0 - x2y2z2); - assert(x2y2z2 >= 0.0); + OPENVDB_ASSERT(x2y2z2 >= 0.0); // @todo The surface buffer may not be at RealT precision. Should we // enforce this by storing the weights in another vector? OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN @@ -886,11 +886,11 @@ transferAttributes(const tree::LeafManager& manager, const Int64Tree& cpg, const math::Transform::Ptr transform) { - assert(manager.leafCount() != 0); + OPENVDB_ASSERT(manager.leafCount() != 0); // masking uses upper 32 bits for leaf node id // @note we can use a point list impl to support larger counts // if necessary but this is far faster - assert(manager.leafCount() < + OPENVDB_ASSERT(manager.leafCount() < size_t(std::numeric_limits::max())); // linearise cpg to avoid having to probe data diff --git a/openvdb/openvdb/points/impl/PointRasterizeTrilinearImpl.h b/openvdb/openvdb/points/impl/PointRasterizeTrilinearImpl.h index 4500dc750f..20a83269f9 100644 --- a/openvdb/openvdb/points/impl/PointRasterizeTrilinearImpl.h +++ b/openvdb/openvdb/points/impl/PointRasterizeTrilinearImpl.h @@ -143,7 +143,7 @@ struct StaggeredTransfer : macw.y() = value(P.y() - (y-RealT(0.5))); for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) { - assert(bounds.isInside(c)); + OPENVDB_ASSERT(bounds.isInside(c)); const Index offset = ij + /*k*/(c.z() & (DIM-1u)); if (!mask.isOn(offset)) continue; const RealT z = static_cast(c.z()-ijk.z()); @@ -213,7 +213,7 @@ struct CellCenteredTransfer : else intersectBox.max().y() += 1; if (P.z() < 0.0f) intersectBox.min().z() -= 1; else intersectBox.max().z() += 1; - assert(intersectBox.volume() == 8); + OPENVDB_ASSERT(intersectBox.volume() == 8); intersectBox.intersect(bounds); if (intersectBox.empty()) return; @@ -237,15 +237,15 @@ struct CellCenteredTransfer : centerw[1] = value(P.y() - y); for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) { - assert(bounds.isInside(c)); + OPENVDB_ASSERT(bounds.isInside(c)); const Index offset = ij + /*k*/(c.z() & (DIM-1u)); if (!mask.isOn(offset)) continue; const RealT z = static_cast(c.z()-ijk.z()); centerw[2] = value(P.z() - z); - assert(centerw[0] >= 0.0f && centerw[0] <= 1.0f); - assert(centerw[1] >= 0.0f && centerw[1] <= 1.0f); - assert(centerw[2] >= 0.0f && centerw[2] <= 1.0f); + OPENVDB_ASSERT(centerw[0] >= 0.0f && centerw[0] <= 1.0f); + OPENVDB_ASSERT(centerw[1] >= 0.0f && centerw[1] <= 1.0f); + OPENVDB_ASSERT(centerw[2] >= 0.0f && centerw[2] <= 1.0f); const RealT weight = centerw.product(); data[offset] += s * weight; diff --git a/openvdb/openvdb/points/impl/PointReplicateImpl.h b/openvdb/openvdb/points/impl/PointReplicateImpl.h index e817fc0c96..5990d98544 100644 --- a/openvdb/openvdb/points/impl/PointReplicateImpl.h +++ b/openvdb/openvdb/points/impl/PointReplicateImpl.h @@ -58,8 +58,8 @@ replicate(const PointDataGridT& source, return *this; } - Index sourceIndex() const { assert(*this); return mSource; } - Index targetIndex() const { assert(*this); return mIt; } + Index sourceIndex() const { OPENVDB_ASSERT(*this); return mSource; } + Index targetIndex() const { OPENVDB_ASSERT(*this); return mIt; } private: Index mIt, mEnd, mSource; @@ -82,7 +82,7 @@ replicate(const PointDataGridT& source, // verify position const size_t ppos = sourceDescriptor.find("P"); - assert(ppos != AttributeSet::INVALID_POS); + OPENVDB_ASSERT(ppos != AttributeSet::INVALID_POS); // build new dummy attribute set @@ -132,7 +132,7 @@
replicate(const PointDataGridT& source, const auto& sourceLeaf = sourceManager.leaf(pos); // @note This really shouldn't return uint64_t as AttributeArray's size is // limited to the max of a uint32_t... - assert(sourceLeaf.pointCount() < Index64(std::numeric_limits::max())); + OPENVDB_ASSERT(sourceLeaf.pointCount() < Index64(std::numeric_limits::max())); const Index sourceCount = static_cast(sourceLeaf.pointCount()); Index uniformMultiplier = multiplier; @@ -193,13 +193,13 @@ replicate(const PointDataGridT& source, auto copy = [&](const std::string& name) { const auto* sourceArray = sourceSet.getConst(name); - assert(sourceArray); + OPENVDB_ASSERT(sourceArray); // manually expand so that copyValues() doesn't expand and fill the array - // we don't want to unnecessarily zero initialize the target values as we know // we're going to write to all of them. auto* array = newSet->get(name); - assert(array); + OPENVDB_ASSERT(array); array->expand(/*fill*/false); if (useScale) { @@ -225,7 +225,7 @@ replicate(const PointDataGridT& source, AttributeWriteHandle idxHandle(*newSet->get(replicationIdx), /*expand*/false); idxHandle.expand(/*fill*/false); - assert(idxHandle.size() == total); + OPENVDB_ASSERT(idxHandle.size() == total); Index offset = 0; diff --git a/openvdb/openvdb/points/impl/PointSampleImpl.h b/openvdb/openvdb/points/impl/PointSampleImpl.h index 0b46c78ae4..851e448248 100644 --- a/openvdb/openvdb/points/impl/PointSampleImpl.h +++ b/openvdb/openvdb/points/impl/PointSampleImpl.h @@ -354,7 +354,7 @@ inline void sampleGrid( size_t order, // append attribute of source grid value type appendAttribute(points.tree(), attribute); targetIndex = leaf->attributeSet().descriptor().find(attribute); - assert(targetIndex != AttributeSet::INVALID_POS); + OPENVDB_ASSERT(targetIndex != AttributeSet::INVALID_POS); // sample using same type as source grid pointDataSampler.template sample(sourceGrid, Index(targetIndex)); @@ -394,7 +394,7 @@ inline void sampleGrid( size_t order, // (point_sample_internal wrapper disables the ability to use DummySampleType) AppendAttributeOp::append(points, attribute); targetIndex = leaf->attributeSet().descriptor().find(attribute); - assert(targetIndex != AttributeSet::INVALID_POS); + OPENVDB_ASSERT(targetIndex != AttributeSet::INVALID_POS); } else { const Name targetType = typeNameAsString(); diff --git a/openvdb/openvdb/points/impl/PointScatterImpl.h b/openvdb/openvdb/points/impl/PointScatterImpl.h index ac95c6af17..842dc6a40b 100644 --- a/openvdb/openvdb/points/impl/PointScatterImpl.h +++ b/openvdb/openvdb/points/impl/PointScatterImpl.h @@ -141,7 +141,7 @@ uniformPointScatter(const GridT& grid, LeafManagerT leafManager(tree); const Index64 voxelCount = leafManager.activeLeafVoxelCount(); - assert(voxelCount != 0); + OPENVDB_ASSERT(voxelCount != 0); const double pointsPerVolume = double(count) / double(voxelCount); const Index32 pointsPerVoxel = static_cast(math::RoundDown(pointsPerVolume)); @@ -177,7 +177,7 @@ uniformPointScatter(const GridT& grid, { const Index64 lowerOffset = voxelOffsets[idx]; // inclusive const Index64 upperOffset = voxelOffsets[idx + 1]; // exclusive - assert(upperOffset > lowerOffset); + OPENVDB_ASSERT(upperOffset > lowerOffset); const auto valuesEnd = values.end(); auto lower = std::lower_bound(values.begin(), valuesEnd, lowerOffset); @@ -194,7 +194,7 @@ uniformPointScatter(const GridT& grid, const Index32 nextOffset = Index32(vId - lowerOffset); iter.increment(nextOffset - currentOffset); currentOffset = nextOffset; - assert(iter); +
OPENVDB_ASSERT(iter); auto& value = data[iter.pos()]; value = value + 1; // no += operator support @@ -235,7 +235,7 @@ uniformPointScatter(const GridT& grid, } // offset should always be non zero - assert(offset != 0); + OPENVDB_ASSERT(offset != 0); point_scatter_internal::generatePositions (*leaf, descriptor, offset, spread, rand01); } diff --git a/openvdb/openvdb/points/impl/PointStatisticsImpl.h b/openvdb/openvdb/points/impl/PointStatisticsImpl.h index 9fad4b0f01..6dd9422164 100644 --- a/openvdb/openvdb/points/impl/PointStatisticsImpl.h +++ b/openvdb/openvdb/points/impl/PointStatisticsImpl.h @@ -172,7 +172,7 @@ bool evalExtents(const PointDataTreeT& points, const size_t size = handle.isUniform() ? 1 : handle.size(); ExtentOp op(handle.get(0)); for (size_t i = 1; i < size; ++i) { - assert(i < size_t(std::numeric_limits::max())); + OPENVDB_ASSERT(i < size_t(std::numeric_limits::max())); op(handle.get(Index(i))); } if (!values.empty()) { @@ -316,7 +316,7 @@ bool evalAverage(const PointDataTreeT& points, void add(const Sample& other) { - assert(other.size > 0); + OPENVDB_ASSERT(other.size > 0); const double denom = 1.0 / static_cast(size + other.size); const ResultT delta = other.avg - avg; avg = avg + (denom * delta * static_cast(other.size)); @@ -352,7 +352,7 @@ bool evalAverage(const PointDataTreeT& points, } else { for (size_t i = 1; i < size; ++i) { - assert(i < size_t(std::numeric_limits::max())); + OPENVDB_ASSERT(i < size_t(std::numeric_limits::max())); S->add(ResultT(handle.get(Index(i)))); } } @@ -374,7 +374,7 @@ bool evalAverage(const PointDataTreeT& points, auto iter = values.cbegin(); while (iter != values.cend() && !(*iter)) ++iter; if (iter == values.cend()) return false; - assert(*iter); + OPENVDB_ASSERT(*iter); // serial deterministic reduction of floating point samples Sample S = **iter; @@ -436,7 +436,7 @@ bool accumulate(const PointDataTreeT& points, const size_t size = handle.isUniform() ? 
1 : handle.size(); auto total = ResultT(handle.get(0)); for (size_t i = 1; i < size; ++i) { - assert(i < size_t(std::numeric_limits::max())); + OPENVDB_ASSERT(i < size_t(std::numeric_limits::max())); total += ResultT(handle.get(Index(i))); } values[leaf.pos()].reset(new ResultT(total)); @@ -455,7 +455,7 @@ bool accumulate(const PointDataTreeT& points, auto iter = values.cbegin(); while (iter != values.cend() && !(*iter)) ++iter; if (iter == values.cend()) return false; - assert(*iter); + OPENVDB_ASSERT(*iter); total = **iter; ++iter; if (std::is_integral::value) { diff --git a/openvdb/openvdb/python/pyGrid.h b/openvdb/openvdb/python/pyGrid.h index 9249eaf6d2..539bf42991 100644 --- a/openvdb/openvdb/python/pyGrid.h +++ b/openvdb/openvdb/python/pyGrid.h @@ -10,6 +10,7 @@ #include #include +#include #ifdef PY_OPENVDB_USE_NUMPY #include #include @@ -942,30 +943,20 @@ struct TreeCombineOp using TreeT = typename GridType::TreeType; using ValueT = typename GridType::ValueType; - TreeCombineOp(py::function _op): op(_op) {} + TreeCombineOp(const std::function& _op): op(_op) {} void operator()(const ValueT& a, const ValueT& b, ValueT& result) { - py::object resultObj = op(a, b); - - if (!py::isinstance(resultObj)) { - std::ostringstream os; - os << "expected callable argument to " << pyutil::GridTraits::name(); - os << ".combine() to return " << openvdb::typeNameAsString(); - os << ", found " << pyutil::className(resultObj); - throw py::type_error(os.str()); - } - - result = py::cast(resultObj); + result = op(a, b); } - py::function op; + const std::function& op; }; template inline void -combine(GridType& grid, GridType& otherGrid, py::function funcObj) +combine(GridType& grid, GridType& otherGrid, const std::function& func) { - TreeCombineOp op(funcObj); + TreeCombineOp op(func); grid.tree().combine(otherGrid.tree(), op, /*prune=*/true); } @@ -1346,7 +1337,7 @@ class IterWrap .def("__getitem__", &IterValueProxyT::getItem, "__getitem__(key) -> value\n\n" "Return the value of the item with the given key.") - .def("__setitem__", &IterValueProxyT::getItem, + .def("__setitem__", &IterValueProxyT::setItem, "__setitem__(key, value)\n\n" "Set the value of the item with the given key."); } diff --git a/openvdb/openvdb/python/pyOpenVDBModule.cc b/openvdb/openvdb/python/pyOpenVDBModule.cc index ed6754ac56..0575919ebc 100644 --- a/openvdb/openvdb/python/pyOpenVDBModule.cc +++ b/openvdb/openvdb/python/pyOpenVDBModule.cc @@ -371,10 +371,6 @@ PYBIND11_MODULE(PY_OPENVDB_MODULE_NAME, m) #undef PYOPENVDB_TRANSLATE_EXCEPTION - py::class_(m, "PointDataIndex32") - .def(py::init(), py::arg("i") = openvdb::Index32(0)); - - // Export the python bindings. exportTransform(m); exportMetadata(m); diff --git a/openvdb/openvdb/tools/Activate.h b/openvdb/openvdb/tools/Activate.h index a01b0facb6..c23b09acc3 100644 --- a/openvdb/openvdb/tools/Activate.h +++ b/openvdb/openvdb/tools/Activate.h @@ -87,6 +87,10 @@ struct ActivateOp // only iterate if there are inactive tiles if (!node.isValueMaskOn()) { for (auto it = node.beginValueOff(); it; ++it) { + // Skip child nodes, they'll be processed separately + // (InternalNode ValueOff iterators don't automatically + // skip these).
+ if (node.isChildMaskOn(it.pos())) continue; if (check(*it)) it.setValueOn(/*on=*/true); } } diff --git a/openvdb/openvdb/tools/Dense.h b/openvdb/openvdb/tools/Dense.h index e42dc52fad..29a458fb7f 100644 --- a/openvdb/openvdb/tools/Dense.h +++ b/openvdb/openvdb/tools/Dense.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "Prune.h" #include #include @@ -326,7 +327,7 @@ class Dense : public DenseBase /// layout of values as an OpenVDB grid, i.e., the fastest coordinate is @e z. inline size_t coordToOffset(const Coord& xyz) const { - assert(BaseT::mBBox.isInside(xyz)); + OPENVDB_ASSERT(BaseT::mBBox.isInside(xyz)); return BaseT::coordToOffset(size_t(xyz[0]-BaseT::mBBox.min()[0]), size_t(xyz[1]-BaseT::mBBox.min()[1]), size_t(xyz[2]-BaseT::mBBox.min()[2])); @@ -514,7 +515,7 @@ class CopyFromDense /// @warning Never call this method directly! void operator()(const tbb::blocked_range &r) const { - assert(mBlocks); + OPENVDB_ASSERT(mBlocks); LeafT* leaf = new LeafT(); for (size_t m=r.begin(), n=0, end = r.end(); m != end; ++m, ++n) { diff --git a/openvdb/openvdb/tools/FastSweeping.h b/openvdb/openvdb/tools/FastSweeping.h index 283013ade1..c457700f77 100644 --- a/openvdb/openvdb/tools/FastSweeping.h +++ b/openvdb/openvdb/tools/FastSweeping.h @@ -35,6 +35,7 @@ #include // for GradStencil #include #include // for PruneMinMaxFltKernel +#include #include "LevelSetUtil.h" #include "Morphology.h" @@ -756,7 +757,7 @@ void FastSweeping::computeSweepMaskLeafOrigins() mSweepingVoxelCount = sweepingVoxelCount; if (mSdfGrid) { const size_t totalCount = mSdfGrid->constTree().activeVoxelCount(); - assert( totalCount >= mSweepingVoxelCount ); + OPENVDB_ASSERT( totalCount >= mSweepingVoxelCount ); mBoundaryVoxelCount = totalCount - mSweepingVoxelCount; } }// FastSweeping::computeSweepMaskLeafOrigins @@ -1061,7 +1062,7 @@ struct FastSweeping::DilateKernel const SdfValueT background = mBackground;//local copy auto* maskLeaf = mParent->mSweepMask.probeLeaf(leaf.origin()); SdfConstAccT sdfInputAcc(mSdfGridInput->tree()); - assert(maskLeaf); + OPENVDB_ASSERT(maskLeaf); for (auto voxelIter = leaf.beginValueOn(); voxelIter; ++voxelIter) { const SdfValueT value = *voxelIter; SdfValueT inputValue; @@ -1636,7 +1637,7 @@ struct FastSweeping::SweepingKernel // If we are using an extension in one direction, we need a reference grid // for the default value of the extension for the voxels that are not // intended to be updated by the sweeping algorithm. - if (tree2 && mode != FastSweepingDomain::SWEEP_ALL) assert(tree3); + if (tree2 && mode != FastSweepingDomain::SWEEP_ALL) OPENVDB_ASSERT(tree3); const std::vector& leafNodeOrigins = mParent->mSweepMaskLeafOrigins; diff --git a/openvdb/openvdb/tools/Filter.h b/openvdb/openvdb/tools/Filter.h index de20a6595e..8f9e1eef69 100644 --- a/openvdb/openvdb/tools/Filter.h +++ b/openvdb/openvdb/tools/Filter.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "Interpolation.h" @@ -271,7 +272,7 @@ struct Voxelizer // do nothing for leaf nodes. They shouldn't even be cached as // part of the NodeManager used with this method. 
- void operator()(const LeafT&) const { assert(false); } + void operator()(const LeafT&) const { OPENVDB_ASSERT(false); } void operator()(const RootT& node) const { diff --git a/openvdb/openvdb/tools/FindActiveValues.h b/openvdb/openvdb/tools/FindActiveValues.h index a8414a3919..4c30bea0f5 100644 --- a/openvdb/openvdb/tools/FindActiveValues.h +++ b/openvdb/openvdb/tools/FindActiveValues.h @@ -386,7 +386,7 @@ typename NodeT::NodeMaskType FindActiveValues::getBBoxMask(const CoordBBo { typename NodeT::NodeMaskType mask;// typically 32^3 or 16^3 bit mask auto b = node->getNodeBoundingBox(); - assert( bbox.hasOverlap(b) ); + OPENVDB_ASSERT( bbox.hasOverlap(b) ); if ( bbox.isInside(b) ) { mask.setOn();//node is completely inside the bbox so early out } else { @@ -396,7 +396,7 @@ typename NodeT::NodeMaskType FindActiveValues::getBBoxMask(const CoordBBo b.min() >>= NodeT::ChildNodeType::TOTAL; b.max() &= NodeT::DIM-1u; b.max() >>= NodeT::ChildNodeType::TOTAL; - assert( b.hasVolume() ); + OPENVDB_ASSERT( b.hasVolume() ); auto it = b.begin();// iterates over all the child nodes or tiles that intersect bbox for (const Coord& ijk = *it; it; ++it) { mask.setOn(ijk[2] + (ijk[1] << NodeT::LOG2DIM) + (ijk[0] << 2*NodeT::LOG2DIM)); @@ -619,9 +619,9 @@ struct TileData , level(parent.getLevel()) , state(true) { - assert(childIdx < ParentNodeT::NUM_VALUES); - assert(parent.isChildMaskOff(childIdx)); - assert(parent.isValueMaskOn(childIdx)); + OPENVDB_ASSERT(childIdx < ParentNodeT::NUM_VALUES); + OPENVDB_ASSERT(parent.isChildMaskOff(childIdx)); + OPENVDB_ASSERT(parent.isValueMaskOn(childIdx)); value = parent.getTable()[childIdx].getValue(); } diff --git a/openvdb/openvdb/tools/Interpolation.h b/openvdb/openvdb/tools/Interpolation.h index 989ab1966f..5a3cbc2eca 100644 --- a/openvdb/openvdb/tools/Interpolation.h +++ b/openvdb/openvdb/tools/Interpolation.h @@ -46,6 +46,7 @@ #include // for Transform #include #include +#include #include #include @@ -557,7 +558,7 @@ class AlphaMask , mInvNorm(1/(max-min)) , mInvert(invert) { - assert(min < max); + OPENVDB_ASSERT(min < max); } inline bool operator()(const Coord& xyz, FloatT& a, FloatT& b) const diff --git a/openvdb/openvdb/tools/LevelSetAdvect.h b/openvdb/openvdb/tools/LevelSetAdvect.h index 26942acca6..4fe44d3450 100644 --- a/openvdb/openvdb/tools/LevelSetAdvect.h +++ b/openvdb/openvdb/tools/LevelSetAdvect.h @@ -16,6 +16,7 @@ #include "VelocityFields.h" // for EnrightField #include #include +#include //#include #include @@ -442,7 +443,7 @@ sampleField(ValueType time0, ValueType time1) } else { mTask = std::bind(&Advect::sampleXformed, ph::_1, ph::_2, time0, time1); } - assert(voxelCount == mParent.mTracker.grid().activeVoxelCount()); + OPENVDB_ASSERT(voxelCount == mParent.mTracker.grid().activeVoxelCount()); mVelocity = new VectorType[ voxelCount ]; this->cook("Sampling advection field"); diff --git a/openvdb/openvdb/tools/Merge.h b/openvdb/openvdb/tools/Merge.h index 17b9ffe61f..e56306aea5 100644 --- a/openvdb/openvdb/tools/Merge.h +++ b/openvdb/openvdb/tools/Merge.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "NodeVisitor.h" @@ -449,7 +450,7 @@ void TreeToMerge::pruneMask(Index level, const Coord& ijk) { if (!mSteal) { - assert(this->hasMask()); + OPENVDB_ASSERT(this->hasMask()); this->mask()->addTile(level, ijk, false, false); } } @@ -1042,7 +1043,7 @@ const typename CsgUnionOrIntersectionOp::ValueT& CsgUnionOrIntersectionOp::background() const { // this operator is only intended to be used with foreachTopDown() - assert(mBackground); +
OPENVDB_ASSERT(mBackground); return *mBackground; } @@ -1254,7 +1255,7 @@ const typename CsgDifferenceOp::ValueT& CsgDifferenceOp::background() const { // this operator is only intended to be used with foreachTopDown() - assert(mBackground); + OPENVDB_ASSERT(mBackground); return *mBackground; } @@ -1263,7 +1264,7 @@ const typename CsgDifferenceOp::ValueT& CsgDifferenceOp::otherBackground() const { // this operator is only intended to be used with foreachTopDown() - assert(mOtherBackground); + OPENVDB_ASSERT(mOtherBackground); return *mOtherBackground; } @@ -1547,7 +1548,7 @@ const typename SumMergeOp::ValueT& SumMergeOp::background() const { // this operator is only intended to be used with foreachTopDown() - assert(mBackground); + OPENVDB_ASSERT(mBackground); return *mBackground; } diff --git a/openvdb/openvdb/tools/MeshToVolume.h b/openvdb/openvdb/tools/MeshToVolume.h index 081170055b..931d01a0f9 100644 --- a/openvdb/openvdb/tools/MeshToVolume.h +++ b/openvdb/openvdb/tools/MeshToVolume.h @@ -22,6 +22,7 @@ #include // for closestPointOnTriangleToPoint #include #include +#include #include #include @@ -933,7 +934,7 @@ class SweepExteriorSign for (Index i = 0; i < LeafNodeType::DIM; ++i) { - assert(pos >= 0); + OPENVDB_ASSERT(pos >= 0); ValueType& dist = data[pos]; if (dist < ValueType(0.0)) { @@ -1975,7 +1976,7 @@ struct VoxelizationData { mPrimCount = 0; primIdTree.root().clear(); primIdTree.clearAllAccessors(); - assert(mPrimCount == 0); + OPENVDB_ASSERT(mPrimCount == 0); } return mPrimCount++; @@ -2501,7 +2502,7 @@ struct ExpandNarrowband LeafNodeType * distNodePt = distAcc.probeLeaf(origin); Int32LeafNodeType * indexNodePt = indexAcc.probeLeaf(origin); - assert(!distNodePt == !indexNodePt); + OPENVDB_ASSERT(!distNodePt == !indexNodePt); bool usingNewNodes = false; diff --git a/openvdb/openvdb/tools/Morphology.h b/openvdb/openvdb/tools/Morphology.h index 78dc938b10..4780d04574 100644 --- a/openvdb/openvdb/tools/Morphology.h +++ b/openvdb/openvdb/tools/Morphology.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -299,7 +300,7 @@ class Morphology case NN_FACE_EDGE_VERTEX : { this->dilate26(mask); return; } case NN_FACE : { this->dilate6(mask); return; } default : { - assert(false && "Unknown op during dilation."); return; + OPENVDB_ASSERT(false && "Unknown op during dilation."); return; } } } @@ -345,7 +346,7 @@ class Morphology case NN_FACE_EDGE_VERTEX : { this->erode26(mask); return; } case NN_FACE : { this->erode6(mask); return; } default : { - assert(false && "Unknown op during erosion."); return; + OPENVDB_ASSERT(false && "Unknown op during erosion."); return; } } } @@ -378,30 +379,30 @@ class Morphology inline void scatter(size_t n, int indx) { - assert(n < mNeighbors.size()); - assert(mNeighbors[n]); + OPENVDB_ASSERT(n < mNeighbors.size()); + OPENVDB_ASSERT(mNeighbors[n]); mNeighbors[n]->template getWord(indx) |= mWord; } template inline void scatter(size_t n, int indx) { - assert(n < mNeighbors.size()); + OPENVDB_ASSERT(n < mNeighbors.size()); if (!mNeighbors[n]) { mNeighbors[n] = this->getNeighbor(); } - assert(mNeighbors[n]); + OPENVDB_ASSERT(mNeighbors[n]); this->scatter(n, indx - (DIM - 1)*(DY + DX*DIM)); } inline Word gather(size_t n, int indx) { - assert(n < mNeighbors.size()); + OPENVDB_ASSERT(n < mNeighbors.size()); return mNeighbors[n]->template getWord(indx); } template inline Word gather(size_t n, int indx) { - assert(n < mNeighbors.size()); + OPENVDB_ASSERT(n < mNeighbors.size()); if (!mNeighbors[n]) { mNeighbors[n] = this->getNeighbor(); } 
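A side note on the OPENVDB_ASSERT(false && "Unknown op during dilation.") calls above: a string literal is always truthy, so the conjunction is guaranteed false and the message text is carried into the failure report. As a hedged sketch only, here is one plausible shape for such a macro, gated by a build option rather than NDEBUG; the actual definition lives in the Assert.h header this PR adds and may differ, for instance by routing through a custom handler.

#include <cstdio>
#include <cstdlib>

// Illustrative only: an always-available assert macro controlled by a
// dedicated build flag instead of the compiler's NDEBUG convention.
#ifdef OPENVDB_ENABLE_ASSERTS
#define OPENVDB_ASSERT(expr) \
    ((expr) ? static_cast<void>(0) \
            : (std::fprintf(stderr, "OpenVDB assertion failed: %s (%s:%d)\n", \
                            #expr, __FILE__, __LINE__), \
               std::abort()))
#else
#define OPENVDB_ASSERT(expr) static_cast<void>(0)
#endif

Decoupling the flag from NDEBUG lets release builds opt in to the checks, which is presumably why a blanket find-and-replace of assert() is worthwhile here.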
@@ -502,7 +503,7 @@ void Morphology::erodeVoxels(const size_t iter, // original tree (it was previously possible when dilateVoxels() // called topologyUnion without the preservation of active // tiles) - assert(!nodemask.isOn()); + OPENVDB_ASSERT(!nodemask.isOn()); } }; @@ -527,7 +528,7 @@ void Morphology::erodeVoxels(const size_t iter, auto subtractTopology = [&](const size_t idx) { auto& leaf = mManager.leaf(idx); const auto* maskleaf = mask.probeConstLeaf(leaf.origin()); - assert(maskleaf); + OPENVDB_ASSERT(maskleaf); leaf.getValueMask() -= maskleaf->getValueMask(); }; @@ -1093,7 +1094,7 @@ void dilateActiveValues(TreeOrLeafManagerT& treeOrLeafM, morph.dilateVoxels(static_cast(iterations), nn, /*prune=*/true); } else { - assert(mode == EXPAND_TILES); + OPENVDB_ASSERT(mode == EXPAND_TILES); morph.dilateVoxels(static_cast(iterations), nn, /*prune=*/false); } return; @@ -1108,7 +1109,7 @@ void dilateActiveValues(TreeOrLeafManagerT& treeOrLeafM, // Note that we also always use a mask if the tile policy is PRESERVE_TILES // due to the way the underlying dilation only works on voxels. // @todo Investigate tile based dilation - assert(mode == PRESERVE_TILES); + OPENVDB_ASSERT(mode == PRESERVE_TILES); MaskT topology; topology.topologyUnion(tree); diff --git a/openvdb/openvdb/tools/MultiResGrid.h b/openvdb/openvdb/tools/MultiResGrid.h index 26800fc9d2..4260cf48ce 100644 --- a/openvdb/openvdb/tools/MultiResGrid.h +++ b/openvdb/openvdb/tools/MultiResGrid.h @@ -34,6 +34,7 @@ #include #include #include +#include #include "Interpolation.h" #include "Morphology.h" @@ -396,7 +397,7 @@ template inline TreeType& MultiResGrid:: tree(size_t level) { - assert( level < mTrees.size() ); + OPENVDB_ASSERT( level < mTrees.size() ); return *mTrees[level]; } @@ -404,7 +405,7 @@ template inline const TreeType& MultiResGrid:: constTree(size_t level) const { - assert( level < mTrees.size() ); + OPENVDB_ASSERT( level < mTrees.size() ); return *mTrees[level]; } @@ -412,7 +413,7 @@ template inline typename TreeType::Ptr MultiResGrid:: treePtr(size_t level) { - assert( level < mTrees.size() ); + OPENVDB_ASSERT( level < mTrees.size() ); return mTrees[level]; } @@ -420,7 +421,7 @@ template inline typename TreeType::ConstPtr MultiResGrid:: constTreePtr(size_t level) const { - assert( level < mTrees.size() ); + OPENVDB_ASSERT( level < mTrees.size() ); return mTrees[level]; } @@ -452,7 +453,7 @@ template typename Grid::Ptr MultiResGrid:: createGrid(float level, size_t grainSize) const { - assert( level >= 0.0f && level <= float(mTrees.size()-1) ); + OPENVDB_ASSERT( level >= 0.0f && level <= float(mTrees.size()-1) ); typename Grid::Ptr grid(new Grid(this->constTree(0).background())); math::Transform::Ptr xform = mTransform->copy(); @@ -522,8 +523,8 @@ template typename TreeType::ValueType MultiResGrid:: sampleValue(const Coord& in_ijk, size_t in_level, size_t out_level) const { - assert( in_level < mTrees.size() ); - assert( out_level < mTrees.size() ); + OPENVDB_ASSERT( in_level < mTrees.size() ); + OPENVDB_ASSERT( out_level < mTrees.size() ); const ConstAccessor acc(*mTrees[out_level]);// has disabled registration!
return tools::Sampler::sample( acc, this->xyz(in_ijk, in_level, out_level) ); } @@ -533,8 +534,8 @@ template typename TreeType::ValueType MultiResGrid:: sampleValue(const Vec3R& in_xyz, size_t in_level, size_t out_level) const { - assert( in_level < mTrees.size() ); - assert( out_level < mTrees.size() ); + OPENVDB_ASSERT( in_level < mTrees.size() ); + OPENVDB_ASSERT( out_level < mTrees.size() ); const ConstAccessor acc(*mTrees[out_level]);// has disabled registration! return tools::Sampler::sample( acc, this->xyz(in_xyz, in_level, out_level) ); } @@ -544,11 +545,11 @@ template typename TreeType::ValueType MultiResGrid:: sampleValue(const Coord& ijk, double level) const { - assert( level >= 0.0 && level <= double(mTrees.size()-1) ); + OPENVDB_ASSERT( level >= 0.0 && level <= double(mTrees.size()-1) ); const size_t level0 = size_t(floor(level)), level1 = size_t(ceil(level)); const ValueType v0 = this->template sampleValue( ijk, 0, level0 ); if ( level0 == level1 ) return v0; - assert( level1 - level0 == 1 ); + OPENVDB_ASSERT( level1 - level0 == 1 ); const ValueType v1 = this->template sampleValue( ijk, 0, level1 ); OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN const ValueType a = ValueType(level1 - level); @@ -561,11 +562,11 @@ template typename TreeType::ValueType MultiResGrid:: sampleValue(const Vec3R& xyz, double level) const { - assert( level >= 0.0 && level <= double(mTrees.size()-1) ); + OPENVDB_ASSERT( level >= 0.0 && level <= double(mTrees.size()-1) ); const size_t level0 = size_t(floor(level)), level1 = size_t(ceil(level)); const ValueType v0 = this->template sampleValue( xyz, 0, level0 ); if ( level0 == level1 ) return v0; - assert( level1 - level0 == 1 ); + OPENVDB_ASSERT( level1 - level0 == 1 ); const ValueType v1 = this->template sampleValue( xyz, 0, level1 ); OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN const ValueType a = ValueType(level1 - level); @@ -577,7 +578,7 @@ template typename TreeType::ValueType MultiResGrid:: prolongateVoxel(const Coord& ijk, const size_t level) const { - assert( level+1 < mTrees.size() ); + OPENVDB_ASSERT( level+1 < mTrees.size() ); const ConstAccessor acc(*mTrees[level + 1]);// has disabled registration! return ProlongateOp::run(ijk, acc); } @@ -586,7 +587,7 @@ template void MultiResGrid:: prolongateActiveVoxels(size_t destlevel, size_t grainSize) { - assert( destlevel < mTrees.size()-1 ); + OPENVDB_ASSERT( destlevel < mTrees.size()-1 ); TreeType &fineTree = *mTrees[ destlevel ]; const TreeType &coarseTree = *mTrees[ destlevel+1 ]; CookOp tmp( coarseTree, fineTree, grainSize ); @@ -596,7 +597,7 @@ template typename TreeType::ValueType MultiResGrid:: restrictVoxel(Coord ijk, const size_t destlevel, bool useInjection) const { - assert( destlevel > 0 && destlevel < mTrees.size() ); + OPENVDB_ASSERT( destlevel > 0 && destlevel < mTrees.size() ); const TreeType &fineTree = *mTrees[ destlevel-1 ]; if ( useInjection ) return fineTree.getValue(ijk<<1); const ConstAccessor acc( fineTree );// has disabled registration! 
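The two fractional-level sampleValue() overloads above bracket the requested level between its floor and ceiling, sample at both integer levels, and blend linearly with weights a = level1 - level and b = level - level0. A self-contained sketch of that blend follows; blendLevels is a hypothetical helper written for illustration, not part of MultiResGrid, and it assumes ValueT supports scalar multiplication and addition.

#include <cmath>

// Linear blend between two already-sampled LOD values, mirroring the
// a*v0 + b*v1 computation in MultiResGrid::sampleValue(..., double level).
template<typename ValueT>
ValueT blendLevels(double level, const ValueT& v0, const ValueT& v1)
{
    const double level0 = std::floor(level);
    const double level1 = std::ceil(level);
    if (level0 == level1) return v0;  // integral level: nothing to blend
    const double a = level1 - level;  // weight of the finer sample v0
    const double b = level - level0;  // weight of the coarser sample v1
    return ValueT(a * v0 + b * v1);   // a + b == 1 by construction
}

For example, blendLevels(1.25, v0, v1) returns 0.75*v0 + 0.25*v1, and the assertion level1 - level0 == 1 in the original code holds exactly when the level is non-integral.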
@@ -607,7 +608,7 @@ template void MultiResGrid:: restrictActiveVoxels(size_t destlevel, size_t grainSize) { - assert( destlevel > 0 && destlevel < mTrees.size() ); + OPENVDB_ASSERT( destlevel > 0 && destlevel < mTrees.size() ); const TreeType &fineTree = *mTrees[ destlevel-1 ]; TreeType &coarseTree = *mTrees[ destlevel ]; CookOp tmp( fineTree, coarseTree, grainSize ); @@ -689,7 +690,7 @@ struct MultiResGrid::MaskOp MaskOp(const TreeType& fineTree, TreeType& coarseTree, size_t grainSize = 1) : mPool(new PoolType( coarseTree ) )// empty coarse tree acts as exemplar { - assert( coarseTree.empty() ); + OPENVDB_ASSERT( coarseTree.empty() ); // Create Mask of restriction performed on fineTree MaskT mask(fineTree, false, true, TopologyCopy() ); @@ -741,8 +742,8 @@ struct MultiResGrid::FractionOp , mTree0( &*(parent.mTrees[size_t(floorf(level))]) )//high-resolution , mTree1( &*(parent.mTrees[size_t(ceilf(level))]) ) //low-resolution { - assert( midTree.empty() ); - assert( mTree0 != mTree1 ); + OPENVDB_ASSERT( midTree.empty() ); + OPENVDB_ASSERT( mTree0 != mTree1 ); // Create a pool of thread-local masks MaskT examplar( false ); diff --git a/openvdb/openvdb/tools/ParticlesToLevelSet.h b/openvdb/openvdb/tools/ParticlesToLevelSet.h index be78d533dd..15baffd329 100644 --- a/openvdb/openvdb/tools/ParticlesToLevelSet.h +++ b/openvdb/openvdb/tools/ParticlesToLevelSet.h @@ -69,6 +69,7 @@ #include #include #include +#include #include #include "Composite.h" // for csgUnion() @@ -566,7 +567,7 @@ struct ParticlesToLevelSet::Raster /// @brief Kick off the optionally multithreaded computation. void operator()(const tbb::blocked_range& r) { - assert(mTask); + OPENVDB_ASSERT(mTask); mTask(this, r); mParent.mMinCount = mMinCount; mParent.mMaxCount = mMaxCount; } diff --git a/openvdb/openvdb/tools/PointIndexGrid.h b/openvdb/openvdb/tools/PointIndexGrid.h index 18f1e55ce9..e502aae55b 100644 --- a/openvdb/openvdb/tools/PointIndexGrid.h +++ b/openvdb/openvdb/tools/PointIndexGrid.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1499,7 +1500,7 @@ struct PointIndexLeafNode : public tree::LeafNode // to the point-array offsets. void assertNonmodifiable() { - assert(false && "Cannot modify voxel values in a PointIndexTree."); + OPENVDB_ASSERT(false && "Cannot modify voxel values in a PointIndexTree."); } void setActiveState(const Coord&, bool) { assertNonmodifiable(); } diff --git a/openvdb/openvdb/tools/PointPartitioner.h b/openvdb/openvdb/tools/PointPartitioner.h index 3853c710bd..1fa80edfda 100644 --- a/openvdb/openvdb/tools/PointPartitioner.h +++ b/openvdb/openvdb/tools/PointPartitioner.h @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -192,15 +193,15 @@ class PointPartitioner::IndexIterator size_t size() const { return mEnd - mBegin; } /// @brief Returns the item to which this iterator is currently pointing. - IndexType& operator*() { assert(mItem != nullptr); return *mItem; } - const IndexType& operator*() const { assert(mItem != nullptr); return *mItem; } + IndexType& operator*() { OPENVDB_ASSERT(mItem != nullptr); return *mItem; } + const IndexType& operator*() const { OPENVDB_ASSERT(mItem != nullptr); return *mItem; } /// @brief Return @c true if this iterator is not yet exhausted. operator bool() const { return mItem < mEnd; } bool test() const { return mItem < mEnd; } /// @brief Advance to the next item.
- IndexIterator& operator++() { assert(this->test()); ++mItem; return *this; } + IndexIterator& operator++() { OPENVDB_ASSERT(this->test()); ++mItem; return *this; } /// @brief Advance to the next item. bool next() { this->operator++(); return this->test(); } @@ -988,7 +989,7 @@ template inline typename PointPartitioner::IndexIterator PointPartitioner::indices(size_t n) const { - assert(bool(mPointIndices) && bool(mPageCount)); + OPENVDB_ASSERT(bool(mPointIndices) && bool(mPageCount)); return IndexIterator( mPointIndices.get() + mPageOffsets[n], mPointIndices.get() + mPageOffsets[n + 1]); diff --git a/openvdb/openvdb/tools/PoissonSolver.h b/openvdb/openvdb/tools/PoissonSolver.h index 183dc00b36..fb2e5642fd 100644 --- a/openvdb/openvdb/tools/PoissonSolver.h +++ b/openvdb/openvdb/tools/PoissonSolver.h @@ -66,6 +66,8 @@ #include #include #include +#include + #include "Morphology.h" // for erodeActiveValues #include @@ -340,7 +342,7 @@ populateIndexTree(VIndexTreeType& result) } // The last accumulated value should be the total of all active voxels. - assert(Index64(perLeafCount[leafCount-1]) == result.activeVoxelCount()); + OPENVDB_ASSERT(Index64(perLeafCount[leafCount-1]) == result.activeVoxelCount()); // Parallelize over the leaf nodes of the tree, storing a unique index // in each active voxel. @@ -461,7 +463,7 @@ struct CopyFromVecOp { const VectorT& vec = *vector; OutLeafT* leaf = tree->probeLeaf(idxLeaf.origin()); - assert(leaf != nullptr); + OPENVDB_ASSERT(leaf != nullptr); for (typename VIdxLeafT::ValueOnCIter it = idxLeaf.cbeginValueOn(); it; ++it) { leaf->setValueOnly(it.pos(), static_cast(vec[*it])); } @@ -533,7 +535,7 @@ struct ISStaggeredLaplacianOp // Loop over active voxels in this leaf. for (typename VIdxLeafT::ValueOnCIter it = idxLeaf.cbeginValueOn(); it; ++it) { - assert(it.getValue() > -1); + OPENVDB_ASSERT(it.getValue() > -1); const math::pcg::SizeType rowNum = static_cast(it.getValue()); LaplacianMatrix::RowEditor row = laplacian->getRowEditor(rowNum); @@ -650,7 +652,7 @@ struct ISLaplacianOp // For each active voxel in this leaf... 
for (typename VIdxLeafT::ValueOnCIter it = idxLeaf.cbeginValueOn(); it; ++it) { - assert(it.getValue() > -1); + OPENVDB_ASSERT(it.getValue() > -1); const Coord ijk = it.getCoord(); const math::pcg::SizeType rowNum = static_cast(it.getValue()); diff --git a/openvdb/openvdb/tools/RayIntersector.h b/openvdb/openvdb/tools/RayIntersector.h index e6063eb7e2..d9b2d3a31e 100644 --- a/openvdb/openvdb/tools/RayIntersector.h +++ b/openvdb/openvdb/tools/RayIntersector.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "Morphology.h" #include #include @@ -642,7 +643,7 @@ class LinearSearchImpl inline RealT interpTime() { - assert( math::isApproxLarger(mT[1], mT[0], RealT(1e-6) ) ); + OPENVDB_ASSERT( math::isApproxLarger(mT[1], mT[0], RealT(1e-6) ) ); return mT[0]+(mT[1]-mT[0])*mV[0]/(mV[0]-mV[1]); } diff --git a/openvdb/openvdb/tools/RayTracer.h b/openvdb/openvdb/tools/RayTracer.h index dc6092d23d..659270176b 100644 --- a/openvdb/openvdb/tools/RayTracer.h +++ b/openvdb/openvdb/tools/RayTracer.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -273,15 +274,15 @@ class Film const RGBA& pixel(size_t w, size_t h) const { - assert(w < mWidth); - assert(h < mHeight); + OPENVDB_ASSERT(w < mWidth); + OPENVDB_ASSERT(h < mHeight); return mPixels[w + h*mWidth]; } RGBA& pixel(size_t w, size_t h) { - assert(w < mWidth); - assert(h < mHeight); + OPENVDB_ASSERT(w < mWidth); + OPENVDB_ASSERT(h < mHeight); return mPixels[w + h*mWidth]; } @@ -356,7 +357,7 @@ class BaseCamera , mScaleWidth(frameWidth) , mScaleHeight(frameWidth * double(film.height()) / double(film.width())) { - assert(nearPlane > 0 && farPlane > nearPlane); + OPENVDB_ASSERT(nearPlane > 0 && farPlane > nearPlane); mScreenToWorld.accumPostRotation(math::X_AXIS, rotation[0] * math::pi() / 180.0); mScreenToWorld.accumPostRotation(math::Y_AXIS, rotation[1] * math::pi() / 180.0); mScreenToWorld.accumPostRotation(math::Z_AXIS, rotation[2] * math::pi() / 180.0); @@ -832,7 +833,7 @@ template inline void LevelSetRayTracer:: setGrid(const GridT& grid) { - assert(mIsMaster); + OPENVDB_ASSERT(mIsMaster); mInter = IntersectorT(grid); } @@ -840,7 +841,7 @@ template inline void LevelSetRayTracer:: setIntersector(const IntersectorT& inter) { - assert(mIsMaster); + OPENVDB_ASSERT(mIsMaster); mInter = inter; } @@ -848,7 +849,7 @@ template inline void LevelSetRayTracer:: setShader(const BaseShader& shader) { - assert(mIsMaster); + OPENVDB_ASSERT(mIsMaster); mShader.reset(shader.copy()); } @@ -856,7 +857,7 @@ template inline void LevelSetRayTracer:: setCamera(BaseCamera& camera) { - assert(mIsMaster); + OPENVDB_ASSERT(mIsMaster); mCamera = &camera; } @@ -864,7 +865,7 @@ template inline void LevelSetRayTracer:: setPixelSamples(size_t pixelSamples, unsigned int seed) { - assert(mIsMaster); + OPENVDB_ASSERT(mIsMaster); if (pixelSamples == 0) { OPENVDB_THROW(ValueError, "pixelSamples must be larger than zero!"); } diff --git a/openvdb/openvdb/tools/VolumeAdvect.h b/openvdb/openvdb/tools/VolumeAdvect.h index 7174b7abf5..de4593fb29 100644 --- a/openvdb/openvdb/tools/VolumeAdvect.h +++ b/openvdb/openvdb/tools/VolumeAdvect.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "Interpolation.h"// for Sampler #include "VelocityFields.h" // for VelocityIntegrator @@ -388,7 +389,7 @@ struct VolumeAdvection::Advec } void operator()(const LeafRangeT& range) const { - assert(mTask); + OPENVDB_ASSERT(mTask); mTask(const_cast(this), range); } void cook(VolumeGridT& outGrid, double time_step) @@ -432,7 +433,7 @@ struct 
VolumeAdvection::Advec void mac(const LeafRangeT& range) const { if (mParent->interrupt()) return; - assert( mParent->mIntegrator == Scheme::MAC ); + OPENVDB_ASSERT( mParent->mIntegrator == Scheme::MAC ); AccT acc = mInGrid->getAccessor(); for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) { ValueT* out0 = leafIter.buffer( 0 ).data();// forward @@ -456,7 +457,7 @@ struct VolumeAdvection::Advec void bfecc(const LeafRangeT& range) const { if (mParent->interrupt()) return; - assert( mParent->mIntegrator == Scheme::BFECC ); + OPENVDB_ASSERT( mParent->mIntegrator == Scheme::BFECC ); AccT acc = mInGrid->getAccessor(); for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) { ValueT* out0 = leafIter.buffer( 0 ).data();// forward @@ -507,7 +508,7 @@ struct VolumeAdvection::Advec ValueT& value = phi[voxelIter.pos()]; if ( doLimiter ) { - assert(OrderRK == 1); + OPENVDB_ASSERT(OrderRK == 1); Vec3d wPos = xform.indexToWorld(voxelIter.getCoord()); mVelocityInt.template rungeKutta<1, Vec3d>(dt, wPos);// Explicit Euler Vec3d iPos = xform.worldToIndex(wPos); diff --git a/openvdb/openvdb/tools/VolumeToMesh.h b/openvdb/openvdb/tools/VolumeToMesh.h index a3cee686e4..603ecc259d 100644 --- a/openvdb/openvdb/tools/VolumeToMesh.h +++ b/openvdb/openvdb/tools/VolumeToMesh.h @@ -14,6 +14,7 @@ #include // for ISGradient #include #include // for INVALID_IDX +#include #include #include @@ -575,8 +576,8 @@ packPoint(const Vec3d& v) uint32_t data = 0; // values are expected to be in the [0.0 to 1.0] range. - assert(!(v.x() > 1.0) && !(v.y() > 1.0) && !(v.z() > 1.0)); - assert(!(v.x() < 0.0) && !(v.y() < 0.0) && !(v.z() < 0.0)); + OPENVDB_ASSERT(!(v.x() > 1.0) && !(v.y() > 1.0) && !(v.z() > 1.0)); + OPENVDB_ASSERT(!(v.x() < 0.0) && !(v.y() < 0.0) && !(v.z() < 0.0)); data |= (uint32_t(v.x() * 1023.0) & MASK_FIRST_10_BITS) << 20; data |= (uint32_t(v.y() * 1023.0) & MASK_FIRST_10_BITS) << 10; @@ -1318,7 +1319,7 @@ computeWeightedPoint(const Vec3d& p, samples.push_back(avg); } - assert(!samples.empty()); + OPENVDB_ASSERT(!samples.empty()); if (samples.size() == 1) { return samples.front(); } @@ -1367,7 +1368,7 @@ computeCellPoints(std::array& points, { size_t offset = 0; for (size_t n = 1, N = sEdgeGroupTable[signs][0] + 1; n < N; ++n, ++offset) { - assert(offset < 4); + OPENVDB_ASSERT(offset < 4); points[offset] = computePoint(values, signs, uint8_t(n), iso); } return offset; @@ -1409,7 +1410,7 @@ computeCellPoints(std::array& points, size_t offset = 0; for (size_t n = 1, N = sEdgeGroupTable[lhsSigns][0] + 1; n < N; ++n, ++offset) { - assert(offset < 4); + OPENVDB_ASSERT(offset < 4); const int id = matchEdgeGroup(uint8_t(n), lhsSigns, rhsSigns); if (id != -1) { diff --git a/openvdb/openvdb/tools/VolumeToSpheres.h b/openvdb/openvdb/tools/VolumeToSpheres.h index edb054df8c..648ec3aaad 100644 --- a/openvdb/openvdb/tools/VolumeToSpheres.h +++ b/openvdb/openvdb/tools/VolumeToSpheres.h @@ -549,7 +549,7 @@ class UpdatePoints const std::vector& mPoints; std::vector& mDistances; std::vector& mMask; - bool mOverlapping; + const bool mOverlapping; float mRadius; int mIndex; }; @@ -578,8 +578,8 @@ UpdatePoints::UpdatePoints(UpdatePoints& rhs, tbb::split) , mDistances(rhs.mDistances) , mMask(rhs.mMask) , mOverlapping(rhs.mOverlapping) - , mRadius(rhs.mRadius) - , mIndex(rhs.mIndex) + , mRadius(0.0) + , mIndex(0) { } diff --git a/openvdb/openvdb/tree/InternalNode.h b/openvdb/openvdb/tree/InternalNode.h index 2d1cf52e7a..7cf653f901 100644 --- a/openvdb/openvdb/tree/InternalNode.h 
+++ b/openvdb/openvdb/tree/InternalNode.h @@ -10,6 +10,7 @@ #include #include +#include #include // for io::readCompressedValues(), etc. #include // for math::isExactlyEqual(), etc. #include @@ -133,7 +134,7 @@ class InternalNode ChildT& getItem(Index pos) const { - assert(this->parent().isChildMaskOn(pos)); + OPENVDB_ASSERT(this->parent().isChildMaskOn(pos)); return *(this->parent().getChildNode(pos)); } @@ -1002,7 +1003,7 @@ template inline void InternalNode::nodeCount(std::vector &vec) const { - assert(vec.size() > ChildNodeType::LEVEL); + OPENVDB_ASSERT(vec.size() > ChildNodeType::LEVEL); const auto count = mChildMask.countOn(); if (ChildNodeType::LEVEL > 0 && count > 0) { for (auto iter = this->cbeginChildOn(); iter; ++iter) iter->nodeCount(vec); @@ -1297,7 +1298,7 @@ template inline void InternalNode::addLeaf(LeafNodeType* leaf) { - assert(leaf != nullptr); + OPENVDB_ASSERT(leaf != nullptr); const Coord& xyz = leaf->origin(); const Index n = this->coordToOffset(xyz); ChildT* child = nullptr; @@ -1326,7 +1327,7 @@ template inline void InternalNode::addLeafAndCache(LeafNodeType* leaf, AccessorT& acc) { - assert(leaf != nullptr); + OPENVDB_ASSERT(leaf != nullptr); const Coord& xyz = leaf->origin(); const Index n = this->coordToOffset(xyz); ChildT* child = nullptr; @@ -1359,7 +1360,7 @@ template inline bool InternalNode::addChild(ChildT* child) { - assert(child); + OPENVDB_ASSERT(child); const Coord& xyz = child->origin(); // verify that the child belongs in this internal node if (Coord((xyz & ~(DIM-1))) != this->origin()) return false; @@ -1375,7 +1376,7 @@ template inline void InternalNode::addTile(Index n, const ValueType& value, bool state) { - assert(n < NUM_VALUES); + OPENVDB_ASSERT(n < NUM_VALUES); this->makeChildNodeEmpty(n, value); mValueMask.set(n, state); } @@ -2231,7 +2232,7 @@ InternalNode::readTopology(std::istream& is, bool fromHalf) for (ValueAllIter iter = this->beginValueAll(); iter; ++iter) { mNodes[iter.pos()].setValue(values[n++]); } - assert(n == numValues); + OPENVDB_ASSERT(n == numValues); } else { for (ValueAllIter iter = this->beginValueAll(); iter; ++iter) { mNodes[iter.pos()].setValue(values[iter.pos()]); @@ -2494,7 +2495,7 @@ struct InternalNode::TopologyUnion A op; t->mValueMask.foreach(s->mValueMask, t->mChildMask, op); - assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes + OPENVDB_ASSERT((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes } void operator()(const tbb::blocked_range &r) const { for (Index i = r.begin(), end=r.end(); i!=end; ++i) { @@ -2545,7 +2546,7 @@ struct InternalNode::TopologyIntersection t->mChildMask.foreach(s->mChildMask, s->mValueMask, t->mValueMask, op); t->mValueMask &= s->mValueMask; - assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes + OPENVDB_ASSERT((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes } void operator()(const tbb::blocked_range &r) const { for (Index i = r.begin(), end=r.end(); i!=end; ++i) { @@ -2600,7 +2601,7 @@ struct InternalNode::TopologyDifference B op2; t->mValueMask.foreach(t->mChildMask, s->mValueMask, oldChildMask, op2); - assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes + OPENVDB_ASSERT((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles or child nodes } void operator()(const tbb::blocked_range &r) const { for (Index i = r.begin(), end=r.end(); i!=end; ++i) { @@ -2663,14 +2664,14 @@ 
InternalNode::combine(InternalNode& other, CombineOp& op) } else if (this->isChildMaskOn(i) && other.isChildMaskOff(i)) { // Combine this node's child with the other node's constant value. ChildNodeType* child = mNodes[i].getChild(); - assert(child); + OPENVDB_ASSERT(child); if (child) { child->combine(other.mNodes[i].getValue(), other.isValueMaskOn(i), op); } } else if (this->isChildMaskOff(i) && other.isChildMaskOn(i)) { // Combine this node's constant value with the other node's child. ChildNodeType* child = other.mNodes[i].getChild(); - assert(child); + OPENVDB_ASSERT(child); if (child) { // Combine this node's constant value with the other node's child, // but use a new functor in which the A and B values are swapped, @@ -2689,8 +2690,8 @@ InternalNode::combine(InternalNode& other, CombineOp& op) ChildNodeType *child = mNodes[i].getChild(), *otherChild = other.mNodes[i].getChild(); - assert(child); - assert(otherChild); + OPENVDB_ASSERT(child); + OPENVDB_ASSERT(otherChild); if (child && otherChild) { child->combine(*otherChild, op); } @@ -2718,7 +2719,7 @@ InternalNode::combine(const ValueType& value, bool valueIsActiv } else /*if (isChildMaskOn(i))*/ { // Combine this node's child with the given constant value. ChildNodeType* child = mNodes[i].getChild(); - assert(child); + OPENVDB_ASSERT(child); if (child) child->combine(value, valueIsActive, op); } } @@ -2794,7 +2795,7 @@ InternalNode::combine2(const ValueType& value, const OtherNodeT mValueMask.set(i, args.resultIsActive()); } else { typename OtherNodeType::ChildNodeType* otherChild = other.mNodes[i].getChild(); - assert(otherChild); + OPENVDB_ASSERT(otherChild); if (this->isChildMaskOff(i)) { // Add a new child with the same coordinates, etc. // as the other node's child. @@ -2827,7 +2828,7 @@ InternalNode::combine2(const InternalNode& other, const OtherVa mValueMask.set(i, args.resultIsActive()); } else { ChildNodeType* otherChild = other.mNodes[i].getChild(); - assert(otherChild); + OPENVDB_ASSERT(otherChild); if (this->isChildMaskOff(i)) { // Add a new child with the same coordinates, etc. as the other node's child. 
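A pattern worth noting in the combine() hunks above: the new OPENVDB_ASSERT fires in any build configured with OPENVDB_ENABLE_ASSERTS (independently of NDEBUG, unlike plain assert), while the retained if (child) check keeps assert-free release builds defensive. A minimal sketch of that double guard; the helper and its combine() call are hypothetical:

    #include <openvdb/util/Assert.h>

    template<typename ChildT, typename OpT>
    void combineWithChild(ChildT* child, OpT& op)
    {
        OPENVDB_ASSERT(child);          // hard failure with file/line/function when asserts are on
        if (child) child->combine(op);  // silent no-op on null when they compile away
    }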
this->setChildNode(i, @@ -2902,7 +2903,7 @@ template inline void InternalNode::offsetToLocalCoord(Index n, Coord &xyz) { - assert(n<(1<<3*Log2Dim)); + OPENVDB_ASSERT(n<(1<<3*Log2Dim)); xyz.setX(n >> 2*Log2Dim); n &= ((1<<2*Log2Dim)-1); xyz.setY(n >> Log2Dim); @@ -3044,7 +3045,7 @@ template inline void InternalNode::resetChildNode(Index i, ChildNodeType* child) { - assert(child); + OPENVDB_ASSERT(child); if (this->isChildMaskOn(i)) { delete mNodes[i].getChild(); } else { @@ -3058,8 +3059,8 @@ template inline void InternalNode::setChildNode(Index i, ChildNodeType* child) { - assert(child); - assert(mChildMask.isOff(i)); + OPENVDB_ASSERT(child); + OPENVDB_ASSERT(mChildMask.isOff(i)); mChildMask.setOn(i); mValueMask.setOff(i); mNodes[i].setChild(child); @@ -3092,7 +3093,7 @@ template inline ChildT* InternalNode::getChildNode(Index n) { - assert(this->isChildMaskOn(n)); + OPENVDB_ASSERT(this->isChildMaskOn(n)); return mNodes[n].getChild(); } @@ -3101,7 +3102,7 @@ template inline const ChildT* InternalNode::getChildNode(Index n) const { - assert(this->isChildMaskOn(n)); + OPENVDB_ASSERT(this->isChildMaskOn(n)); return mNodes[n].getChild(); } diff --git a/openvdb/openvdb/tree/LeafBuffer.h b/openvdb/openvdb/tree/LeafBuffer.h index ce33101685..391a9fc1ad 100644 --- a/openvdb/openvdb/tree/LeafBuffer.h +++ b/openvdb/openvdb/tree/LeafBuffer.h @@ -7,6 +7,7 @@ #include #include // for io::readCompressedValues(), etc #include +#include #include #include // for std::swap #include @@ -231,7 +232,7 @@ template inline void LeafBuffer::setValue(Index i, const ValueType& val) { - assert(i < SIZE); + OPENVDB_ASSERT(i < SIZE); this->loadValues(); if (mData) mData[i] = val; } @@ -378,7 +379,7 @@ inline const typename LeafBuffer::ValueType& LeafBuffer::at(Index i) const { static const ValueType sZero = zeroVal(); - assert(i < SIZE); + OPENVDB_ASSERT(i < SIZE); this->loadValues(); // We can't use the ternary operator here, otherwise Visual C++ returns // a reference to a temporary. @@ -418,9 +419,9 @@ LeafBuffer::doLoad() const if (!this->isOutOfCore()) return; std::unique_ptr info(self->mFileInfo); - assert(info.get() != nullptr); - assert(info->mapping.get() != nullptr); - assert(info->meta.get() != nullptr); + OPENVDB_ASSERT(info.get() != nullptr); + OPENVDB_ASSERT(info->mapping.get() != nullptr); + OPENVDB_ASSERT(info->meta.get() != nullptr); /// @todo For now, we have to clear the mData pointer in order for allocate() to take effect. self->mData = nullptr; @@ -488,7 +489,7 @@ class LeafBuffer const bool& getValue(Index i) const { - assert(i < SIZE); + OPENVDB_ASSERT(i < SIZE); // We can't use the ternary operator here, otherwise Visual C++ returns // a reference to a temporary. 
if (mData.isOn(i)) return sOn; else return sOff; @@ -498,7 +499,7 @@ class LeafBuffer bool operator==(const LeafBuffer& other) const { return mData == other.mData; } bool operator!=(const LeafBuffer& other) const { return mData != other.mData; } - void setValue(Index i, bool val) { assert(i < SIZE); mData.set(i, val); } + void setValue(Index i, bool val) { OPENVDB_ASSERT(i < SIZE); mData.set(i, val); } void swap(LeafBuffer& other) { if (&other != this) std::swap(mData, other.mData); } diff --git a/openvdb/openvdb/tree/LeafManager.h b/openvdb/openvdb/tree/LeafManager.h index bacb0557d9..0f6293d443 100644 --- a/openvdb/openvdb/tree/LeafManager.h +++ b/openvdb/openvdb/tree/LeafManager.h @@ -16,6 +16,7 @@ #define OPENVDB_TREE_LEAFMANAGER_HAS_BEEN_INCLUDED #include +#include #include "RootNode.h" // for NodeChain #include #include @@ -106,7 +107,7 @@ class LeafManager public: Iterator(const LeafRange& range, size_t pos): mRange(range), mPos(pos) { - assert(this->isValid()); + OPENVDB_ASSERT(this->isValid()); } Iterator(const Iterator&) = default; Iterator& operator=(const Iterator&) = default; @@ -180,7 +181,7 @@ class LeafManager static size_t doSplit(LeafRange& r) { - assert(r.is_divisible()); + OPENVDB_ASSERT(r.is_divisible()); size_t middle = r.mBegin + (r.mEnd - r.mBegin) / 2u; r.mEnd = middle; return middle; @@ -315,7 +316,7 @@ class LeafManager /// @brief Return a pointer to the leaf node at index @a leafIdx in the array. /// @note For performance reasons no range check is performed (other than an assertion)! - LeafType& leaf(size_t leafIdx) const { assert(leafIdxbuffer() : mAuxBuffers[leafIdx * mAuxBuffersPerLeaf + bufferIdx - 1]; } diff --git a/openvdb/openvdb/tree/LeafNode.h b/openvdb/openvdb/tree/LeafNode.h index 33ec3a56ae..93f7927afd 100644 --- a/openvdb/openvdb/tree/LeafNode.h +++ b/openvdb/openvdb/tree/LeafNode.h @@ -6,6 +6,7 @@ #include #include +#include #include // for io::readData(), etc. #include "Iterator.h" #include "LeafBuffer.h" @@ -396,7 +397,7 @@ class LeafNode /// Set the active state of the voxel at the given coordinates but don't change its value. void setActiveState(const Coord& xyz, bool on); /// Set the active state of the voxel at the given offset but don't change its value. 
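LeafManager's LeafRange::doSplit above implements the TBB splittable-range contract: the original range shrinks to the lower half [begin, middle) and the newly split range takes [middle, end), recursing until is_divisible() fails. A minimal standalone sketch of the same halving, assuming a grain-size field:

    #include <tbb/blocked_range.h>  // for tbb::split

    struct Range {
        size_t mBegin, mEnd, mGrainSize;
        bool is_divisible() const { return mEnd - mBegin > mGrainSize; }
        // splitting constructor: take the upper half, shrink the original to the lower half
        Range(Range& r, tbb::split)
            : mBegin(r.mBegin + (r.mEnd - r.mBegin) / 2u)
            , mEnd(r.mEnd)
            , mGrainSize(r.mGrainSize)
        { r.mEnd = mBegin; }
    };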
- void setActiveState(Index offset, bool on) { assert(offsetsetValueOn(LeafNode::coordToOffset(xyz), val); @@ -1009,7 +1010,7 @@ template inline Index LeafNode::coordToOffset(const Coord& xyz) { - assert ((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); + OPENVDB_ASSERT((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); return ((xyz[0] & (DIM-1u)) << 2*Log2Dim) + ((xyz[1] & (DIM-1u)) << Log2Dim) + (xyz[2] & (DIM-1u)); @@ -1019,7 +1020,7 @@ template inline Coord LeafNode::offsetToLocalCoord(Index n) { - assert(n<(1<< 3*Log2Dim)); + OPENVDB_ASSERT(n<(1<< 3*Log2Dim)); Coord xyz; xyz.setX(n >> 2*Log2Dim); n &= ((1<<2*Log2Dim)-1); @@ -1051,7 +1052,7 @@ template inline const ValueT& LeafNode::getValue(Index offset) const { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); return mBuffer[offset]; } @@ -1067,7 +1068,7 @@ template inline bool LeafNode::probeValue(Index offset, ValueType& val) const { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); val = mBuffer[offset]; return mValueMask.isOn(offset); } @@ -1084,7 +1085,7 @@ template inline void LeafNode::setValueOff(Index offset, const ValueType& val) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); mBuffer.setValue(offset, val); mValueMask.setOff(offset); } @@ -1109,7 +1110,7 @@ template inline void LeafNode::setValueOnly(Index offset, const ValueType& val) { - assert(offset inline bool LeafNode::hasSameTopology(const LeafNode* other) const { - assert(other); + OPENVDB_ASSERT(other); return (Log2Dim == OtherLog2Dim && mValueMask == other->getValueMask()); } @@ -1586,7 +1587,7 @@ template inline void LeafNode::addTile(Index offset, const ValueType& val, bool active) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); setValueOnly(offset, val); setActiveState(offset, active); } @@ -1610,6 +1611,7 @@ LeafNode::resetBackground(const ValueType& oldBackground, const ValueType& newBackground) { if (!this->allocate()) return; + if (math::isExactlyEqual(oldBackground, newBackground)) return; typename NodeMaskType::OffIterator iter; // For all inactive values... diff --git a/openvdb/openvdb/tree/LeafNodeBool.h b/openvdb/openvdb/tree/LeafNodeBool.h index 02f3cb4c99..a5290dbbbc 100644 --- a/openvdb/openvdb/tree/LeafNodeBool.h +++ b/openvdb/openvdb/tree/LeafNodeBool.h @@ -6,6 +6,7 @@ #include #include // for io::readData(), etc. +#include #include // for math::isZero() #include #include "LeafNode.h" @@ -246,17 +247,17 @@ class LeafNode /// Set the active state of the voxel at the given coordinates but don't change its value. void setActiveState(const Coord& xyz, bool on); /// Set the active state of the voxel at the given offset but don't change its value. - void setActiveState(Index offset, bool on) { assert(offsetcoordToOffset(xyz)); } /// Mark the voxel at the given offset as inactive but don't change its value. - void setValueOff(Index offset) { assert(offset < SIZE); mValueMask.setOff(offset); } + void setValueOff(Index offset) { OPENVDB_ASSERT(offset < SIZE); mValueMask.setOff(offset); } /// Set the value of the voxel at the given coordinates and mark the voxel as inactive. void setValueOff(const Coord& xyz, bool val); @@ -266,7 +267,7 @@ class LeafNode /// Mark the voxel at the given coordinates as active but don't change its value. void setValueOn(const Coord& xyz) { mValueMask.setOn(this->coordToOffset(xyz)); } /// Mark the voxel at the given offset as active but don't change its value. 
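The coordToOffset assertions above guard the bit packing that linearizes a leaf-local (x, y, z) into a voxel offset: each coordinate is masked to the leaf dimension and shifted into its own bit field. A self-contained sketch for the standard 8^3 leaf (Log2Dim = 3):

    inline unsigned coordToOffset(int x, int y, int z)
    {
        constexpr unsigned Log2Dim = 3, DIM = 1u << Log2Dim;  // 8 voxels per axis
        return ((x & (DIM - 1u)) << 2 * Log2Dim)  // x -> bits 6..8
             + ((y & (DIM - 1u)) << Log2Dim)      // y -> bits 3..5
             +  (z & (DIM - 1u));                 // z -> bits 0..2
    }
    // e.g. coordToOffset(1, 2, 3) == 64 + 16 + 3 == 83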
- void setValueOn(Index offset) { assert(offset < SIZE); mValueMask.setOn(offset); } + void setValueOn(Index offset) { OPENVDB_ASSERT(offset < SIZE); mValueMask.setOn(offset); } /// Set the value of the voxel at the given coordinates and mark the voxel as active. void setValueOn(const Coord& xyz, bool val); @@ -296,7 +297,7 @@ class LeafNode /// Return @c true if the voxel at the given coordinates is active. bool isValueOn(const Coord& xyz) const { return mValueMask.isOn(this->coordToOffset(xyz)); } /// Return @c true if the voxel at the given offset is active. - bool isValueOn(Index offset) const { assert(offset < SIZE); return mValueMask.isOn(offset); } + bool isValueOn(Index offset) const { OPENVDB_ASSERT(offset < SIZE); return mValueMask.isOn(offset); } /// Return @c false since leaf nodes never contain tiles. static bool hasActiveTiles() { return false; } @@ -922,7 +923,7 @@ template inline bool LeafNode::hasSameTopology(const LeafNode* other) const { - assert(other); + OPENVDB_ASSERT(other); return (Log2Dim == OtherLog2Dim && mValueMask == other->getValueMask()); } @@ -945,7 +946,7 @@ template inline Index LeafNode::coordToOffset(const Coord& xyz) { - assert ((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); + OPENVDB_ASSERT((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); return ((xyz[0] & (DIM-1u)) << 2*Log2Dim) + ((xyz[1] & (DIM-1u)) << Log2Dim) + (xyz[2] & (DIM-1u)); @@ -956,7 +957,7 @@ template inline Coord LeafNode::offsetToLocalCoord(Index n) { - assert(n < (1 << 3*Log2Dim)); + OPENVDB_ASSERT(n < (1 << 3*Log2Dim)); Coord xyz; xyz.setX(n >> 2*Log2Dim); n &= ((1 << 2*Log2Dim) - 1); @@ -1146,7 +1147,7 @@ template inline void LeafNode::addTile(Index offset, bool val, bool active) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); this->setValueOnly(offset, val); this->setActiveState(offset, active); } @@ -1177,7 +1178,7 @@ template inline const bool& LeafNode::getValue(Index offset) const { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); // This *CANNOT* use operator ? for Windows if (mBuffer.mData.isOn(offset)) return Buffer::sOn; else return Buffer::sOff; } @@ -1205,7 +1206,7 @@ template inline void LeafNode::setValueOn(Index offset, bool val) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); mValueMask.setOn(offset); mBuffer.mData.set(offset, val); } @@ -1239,7 +1240,7 @@ template inline void LeafNode::setValueOff(Index offset, bool val) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); mValueMask.setOff(offset); mBuffer.mData.set(offset, val); } diff --git a/openvdb/openvdb/tree/LeafNodeMask.h b/openvdb/openvdb/tree/LeafNodeMask.h index a8e31ec9e7..d044130ab8 100644 --- a/openvdb/openvdb/tree/LeafNodeMask.h +++ b/openvdb/openvdb/tree/LeafNodeMask.h @@ -9,6 +9,7 @@ #include // for io::readData(), etc. #include // for math::isZero() #include +#include #include "LeafNode.h" #include "Iterator.h" #include @@ -228,17 +229,17 @@ class LeafNode /// Set the active state of the voxel at the given coordinates but don't change its value. void setActiveState(const Coord& xyz, bool on); /// Set the active state of the voxel at the given offset but don't change its value. - void setActiveState(Index offset, bool on) { assert(offsetcoordToOffset(xyz)); } /// Mark the voxel at the given offset as inactive but don't change its value. 
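The "*CANNOT* use operator ?" comments in the bool getValue() overloads above refer to a Visual C++ pitfall: returning "cond ? sOn : sOff" from a function that returns const bool& can end up binding the reference to a temporary. A minimal sketch of the workaround, assuming two static flag objects as in the Buffer class:

    static const bool sOn = true, sOff = false;

    inline const bool& getFlag(bool on)
    {
        // each branch returns a reference to a persistent lvalue, never a temporary
        if (on) return sOn; else return sOff;
    }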
- void setValueOff(Index offset) { assert(offset < SIZE); mBuffer.mData.setOff(offset); } + void setValueOff(Index offset) { OPENVDB_ASSERT(offset < SIZE); mBuffer.mData.setOff(offset); } /// Set the value of the voxel at the given coordinates and mark the voxel as inactive. void setValueOff(const Coord& xyz, bool val); @@ -248,7 +249,7 @@ class LeafNode /// Mark the voxel at the given coordinates as active but don't change its value. void setValueOn(const Coord& xyz) { mBuffer.mData.setOn(this->coordToOffset(xyz)); } /// Mark the voxel at the given offset as active but don't change its value. - void setValueOn(Index offset) { assert(offset < SIZE); mBuffer.mData.setOn(offset); } + void setValueOn(Index offset) { OPENVDB_ASSERT(offset < SIZE); mBuffer.mData.setOn(offset); } /// Set the value of the voxel at the given coordinates and mark the voxel as active. void setValueOn(const Coord& xyz, bool val); @@ -278,7 +279,7 @@ class LeafNode /// Return @c true if the voxel at the given coordinates is active. bool isValueOn(const Coord& xyz) const { return mBuffer.mData.isOn(this->coordToOffset(xyz)); } /// Return @c true if the voxel at the given offset is active. - bool isValueOn(Index offset) const { assert(offset < SIZE); return mBuffer.mData.isOn(offset); } + bool isValueOn(Index offset) const { OPENVDB_ASSERT(offset < SIZE); return mBuffer.mData.isOn(offset); } /// Return @c false since leaf nodes never contain tiles. static bool hasActiveTiles() { return false; } @@ -880,7 +881,7 @@ template inline bool LeafNode::hasSameTopology(const LeafNode* other) const { - assert(other); + OPENVDB_ASSERT(other); return (Log2Dim == OtherLog2Dim && mBuffer.mData == other->getValueMask()); } @@ -903,7 +904,7 @@ template inline Index LeafNode::coordToOffset(const Coord& xyz) { - assert ((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); + OPENVDB_ASSERT((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM); return ((xyz[0] & (DIM-1u)) << 2*Log2Dim) + ((xyz[1] & (DIM-1u)) << Log2Dim) + (xyz[2] & (DIM-1u)); @@ -914,7 +915,7 @@ template inline Coord LeafNode::offsetToLocalCoord(Index n) { - assert(n < (1 << 3*Log2Dim)); + OPENVDB_ASSERT(n < (1 << 3*Log2Dim)); Coord xyz; xyz.setX(n >> 2*Log2Dim); n &= ((1 << 2*Log2Dim) - 1); @@ -1067,7 +1068,7 @@ template inline void LeafNode::addTile(Index offset, bool val, bool active) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); this->setValueOnly(offset, val); this->setActiveState(offset, active); } @@ -1098,7 +1099,7 @@ template inline const bool& LeafNode::getValue(Index offset) const { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); // This *CANNOT* use operator ? 
for Windows if (mBuffer.mData.isOn(offset)) return Buffer::sOn; else return Buffer::sOff; } @@ -1126,7 +1127,7 @@ template inline void LeafNode::setValueOn(Index offset, bool val) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); mBuffer.mData.set(offset, val); } @@ -1159,7 +1160,7 @@ template inline void LeafNode::setValueOff(Index offset, bool val) { - assert(offset < SIZE); + OPENVDB_ASSERT(offset < SIZE); mBuffer.mData.set(offset, val); } diff --git a/openvdb/openvdb/tree/NodeManager.h b/openvdb/openvdb/tree/NodeManager.h index 4d0d9b466e..ce483bd7df 100644 --- a/openvdb/openvdb/tree/NodeManager.h +++ b/openvdb/openvdb/tree/NodeManager.h @@ -14,6 +14,7 @@ #define OPENVDB_TREE_NODEMANAGER_HAS_BEEN_INCLUDED #include +#include #include #include #include @@ -56,9 +57,9 @@ class NodeList public: NodeList() = default; - NodeT& operator()(size_t n) const { assert(nisValid()); + OPENVDB_ASSERT(this->isValid()); } Iterator(const Iterator&) = default; Iterator& operator=(const Iterator&) = default; @@ -251,7 +252,7 @@ class NodeList static size_t doSplit(NodeRange& r) { - assert(r.is_divisible()); + OPENVDB_ASSERT(r.is_divisible()); size_t middle = r.mBegin + (r.mEnd - r.mBegin) / 2u; r.mEnd = middle; return middle; diff --git a/openvdb/openvdb/tree/RootNode.h b/openvdb/openvdb/tree/RootNode.h index c9abc2e965..0d182b5864 100644 --- a/openvdb/openvdb/tree/RootNode.h +++ b/openvdb/openvdb/tree/RootNode.h @@ -14,6 +14,7 @@ #include // for isZero(), isExactlyEqual(), etc. #include #include // for backward compatibility only (see readTopology()) +#include #include #include #include @@ -224,7 +225,7 @@ class RootNode return *mParentNode; } - bool test() const { assert(mParentNode); return mIter != mParentNode->mTable.end(); } + bool test() const { OPENVDB_ASSERT(mParentNode); return mIter != mParentNode->mTable.end(); } operator bool() const { return this->test(); } void increment() { if (this->test()) { ++mIter; } this->skip(); } @@ -301,12 +302,12 @@ class RootNode ValueT& operator*() const { return this->getValue(); } ValueT* operator->() const { return &(this->getValue()); } - void setValue(const ValueT& v) const { assert(isTile(mIter)); getTile(mIter).value = v; } + void setValue(const ValueT& v) const { OPENVDB_ASSERT(isTile(mIter)); getTile(mIter).value = v; } template void modifyValue(const ModifyOp& op) const { - assert(isTile(mIter)); + OPENVDB_ASSERT(isTile(mIter)); op(getTile(mIter).value); } }; // ValueIter @@ -345,7 +346,7 @@ class RootNode bool probeValue(NonConstValueType& value) const { return !this->probeChild(value); } void setChild(ChildNodeT& c) const { RootNodeT::setChild(mIter, c); } - void setChild(ChildNodeT* c) const { assert(c != nullptr); RootNodeT::setChild(mIter, *c); } + void setChild(ChildNodeT* c) const { OPENVDB_ASSERT(c != nullptr); RootNodeT::setChild(mIter, *c); } void setValue(const ValueT& v) const { if (isTile(mIter)) getTile(mIter).value = v; @@ -1677,7 +1678,7 @@ template inline void RootNode::nodeCount(std::vector &vec) const { - assert(vec.size() > LEVEL); + OPENVDB_ASSERT(vec.size() > LEVEL); Index32 sum = 0; for (MapCIter i = mTable.begin(), e = mTable.end(); i != e; ++i) { if (isChild(i)) { diff --git a/openvdb/openvdb/tree/Tree.h b/openvdb/openvdb/tree/Tree.h index 622adc32bf..44e77c5307 100644 --- a/openvdb/openvdb/tree/Tree.h +++ b/openvdb/openvdb/tree/Tree.h @@ -13,6 +13,7 @@ #include // tools::countActiveVoxels(), tools::memUsage(), tools::minMax() #include #include +#include #include #include "RootNode.h" #include "InternalNode.h" @@ 
-515,7 +516,7 @@ class Tree: public TreeBase /// /// @warning Ownership of the leaf is transferred to the tree so /// the client code should not attempt to delete the leaf pointer! - void addLeaf(LeafNodeType* leaf) { assert(leaf); mRoot.addLeaf(leaf); } + void addLeaf(LeafNodeType* leaf) { OPENVDB_ASSERT(leaf); mRoot.addLeaf(leaf); } /// @brief Add a tile containing voxel (x, y, z) at the specified tree level, /// creating a new branch if necessary. Delete any existing lower-level nodes @@ -1971,7 +1972,7 @@ Tree::print(std::ostream& os, int verboseLevel) const const auto nodeCount = this->nodeCount();//fast const Index32 leafCount = nodeCount.front();// leaf is the first element - assert(dims.size() == nodeCount.size()); + OPENVDB_ASSERT(dims.size() == nodeCount.size()); Index64 totalNodeCount = 0; for (size_t i = 0; i < nodeCount.size(); ++i) totalNodeCount += nodeCount[i]; diff --git a/openvdb/openvdb/tree/TreeIterator.h b/openvdb/openvdb/tree/TreeIterator.h index 0f81f7399a..c3d1f76780 100644 --- a/openvdb/openvdb/tree/TreeIterator.h +++ b/openvdb/openvdb/tree/TreeIterator.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -585,7 +586,7 @@ class IterListItem const NCValueT& getValue(Index lvl) const { - assert(lvl == Level); + OPENVDB_ASSERT(lvl == Level); (void)lvl; // avoid unused variable warning in optimized builds return mIter.getValue(); } diff --git a/openvdb/openvdb/tree/ValueAccessor.h b/openvdb/openvdb/tree/ValueAccessor.h index c2269839b2..bf749e8785 100644 --- a/openvdb/openvdb/tree/ValueAccessor.h +++ b/openvdb/openvdb/tree/ValueAccessor.h @@ -47,10 +47,10 @@ #include #include +#include #include -#include #include #include #include @@ -198,7 +198,7 @@ class ValueAccessorBase TreeType* getTree() const { return mTree; } /// @brief Return a reference to the tree associated with this accessor. 
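TreeIterator's getValue(Index lvl) above shows a companion idiom to the assert swap: once OPENVDB_ASSERT(lvl == Level) compiles away (OPENVDB_ENABLE_ASSERTS off), lvl would be flagged as unused in optimized builds, hence the explicit (void)lvl. A minimal sketch with a hypothetical accessor:

    #include <openvdb/util/Assert.h>

    template<unsigned Level>
    int getValueAt(unsigned lvl, const int* values)
    {
        OPENVDB_ASSERT(lvl == Level);
        (void)lvl;  // silences unused-variable warnings when the assert is a no-op
        return values[Level];
    }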
- TreeType& tree() const { assert(mTree); return *mTree; } + TreeType& tree() const { OPENVDB_ASSERT(mTree); return *mTree; } /// @brief Pure virtual method, clears the derived accessor virtual void clear() = 0; @@ -334,8 +334,8 @@ struct ValueAccessorLeafBuffer template static constexpr bool BypassLeafAPI = std::is_same::value; - inline const typename TreeTypeT::ValueType* buffer() { assert(mBuffer); return mBuffer; } - inline const typename TreeTypeT::ValueType* buffer() const { assert(mBuffer); return mBuffer; } + inline const typename TreeTypeT::ValueType* buffer() { OPENVDB_ASSERT(mBuffer); return mBuffer; } + inline const typename TreeTypeT::ValueType* buffer() const { OPENVDB_ASSERT(mBuffer); return mBuffer; } inline void setBuffer(const typename TreeTypeT::ValueType* b) const { mBuffer = b; } private: mutable const typename TreeTypeT::ValueType* mBuffer; @@ -350,9 +350,9 @@ struct ValueAccessorLeafBuffer::type> { template static constexpr bool BypassLeafAPI = false; - inline constexpr typename TreeTypeT::ValueType* buffer() { assert(false); return nullptr; } - inline constexpr typename TreeTypeT::ValueType* buffer() const { assert(false); return nullptr; } - inline constexpr void setBuffer(const typename TreeTypeT::ValueType*) const { assert(false); } + inline constexpr typename TreeTypeT::ValueType* buffer() { OPENVDB_ASSERT(false); return nullptr; } + inline constexpr typename TreeTypeT::ValueType* buffer() const { OPENVDB_ASSERT(false); return nullptr; } + inline constexpr void setBuffer(const typename TreeTypeT::ValueType*) const { OPENVDB_ASSERT(false); } }; /////////////////////////////////////////////////////////////////////////////// @@ -469,7 +469,7 @@ class ValueAccessorImpl final : } else { auto node = mNodes.template get(); - assert(node); + OPENVDB_ASSERT(node); return &(node->getValueAndCache(xyz, *this)); } }); @@ -480,7 +480,7 @@ class ValueAccessorImpl final : bool isValueOn(const Coord& xyz) const { return this->evalFirstCached(xyz, [&](const auto node) -> bool { - assert(node); + OPENVDB_ASSERT(node); return node->isValueOnAndCache(xyz, *this); }); } @@ -494,7 +494,7 @@ class ValueAccessorImpl final : return this->evalFirstCached(xyz, [&](const auto node) -> bool { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(IsLeafAndBypassLeafAPI) { const auto offset = LeafNodeT::coordToOffset(xyz); @@ -518,7 +518,7 @@ class ValueAccessorImpl final : return this->evalFirstCached(xyz, [&](const auto node) -> int { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(std::is_same::value) { return node->getValueDepthAndCache(xyz, *this); @@ -534,7 +534,7 @@ class ValueAccessorImpl final : /// @param xyz The index space coordinate to query bool isVoxel(const Coord& xyz) const { - assert(BaseT::mTree); + OPENVDB_ASSERT(BaseT::mTree); return this->getValueDepth(xyz) == static_cast(RootNodeT::LEVEL); } @@ -553,7 +553,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(IsLeafAndBypassLeafAPI) { const auto offset = LeafNodeT::coordToOffset(xyz); @@ -590,7 +590,7 @@ class ValueAccessorImpl final : } else { auto node = mNodes.template get(); - assert(node); + OPENVDB_ASSERT(node); const_cast(node)->setValueOnlyAndCache(xyz, value, *this); } return true; @@ -610,7 +610,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using 
NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(IsLeafAndBypassLeafAPI) { const auto offset = LeafNodeT::coordToOffset(xyz); @@ -635,7 +635,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(IsLeafAndBypassLeafAPI) { const auto offset = LeafNodeT::coordToOffset(xyz); @@ -659,7 +659,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(IsLeafAndBypassLeafAPI) { const auto offset = LeafNodeT::coordToOffset(xyz); @@ -686,7 +686,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); const_cast(node)->setActiveStateAndCache(xyz, on, *this); }); } @@ -718,7 +718,7 @@ class ValueAccessorImpl final : return this->evalFirstCached(xyz, [&](const auto node) -> LeafNodeT* { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); return const_cast(node)->touchLeafAndCache(xyz, *this); }); } @@ -731,11 +731,11 @@ class ValueAccessorImpl final : constexpr int64_t Start = NodeLevelList::template Index + 1; static_assert(!BaseT::IsConstTree, "can't add a node to a const tree"); static_assert(Start >= 0); - assert(leaf); + OPENVDB_ASSERT(leaf); this->evalFirstCached(leaf->origin(), [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); const_cast(node)->addLeafAndCache(leaf, *this); }); } @@ -759,7 +759,7 @@ class ValueAccessorImpl final : this->evalFirstCached(xyz, [&](const auto node) -> void { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); const_cast(node)->addTileAndCache(level, xyz, value, state, *this); }); } @@ -790,12 +790,12 @@ class ValueAccessorImpl final : [&](const auto node) -> NodeT* { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(std::is_same::value) { return const_cast(node); } else { - assert(NodeT::LEVEL < NodeType::LEVEL); + OPENVDB_ASSERT(NodeT::LEVEL < NodeType::LEVEL); return const_cast(node)->template probeNodeAndCache(xyz, *this); } }); @@ -817,12 +817,12 @@ class ValueAccessorImpl final : [&](const auto node) -> const NodeT* { using NodeType = std::remove_pointer_t; - assert(node); + OPENVDB_ASSERT(node); if constexpr(std::is_same::value) { return node; } else { - assert(NodeT::LEVEL < NodeType::LEVEL); + OPENVDB_ASSERT(NodeT::LEVEL < NodeType::LEVEL); return const_cast(node)->template probeConstNodeAndCache(xyz, *this); } }); @@ -969,7 +969,7 @@ class ValueAccessorImpl final : template OPENVDB_FORCE_INLINE auto evalFirstIndex(OpT&& op) const { - assert(BaseT::mTree); + OPENVDB_ASSERT(BaseT::mTree); // Mutex lock the accessor. Does nothing if no mutex if in place [[maybe_unused]] const auto lock = this->lock(); // Get the return type of the provided operation OpT @@ -986,7 +986,7 @@ class ValueAccessorImpl final : template OPENVDB_FORCE_INLINE auto evalFirstPred(PredT&& pred, OpT&& op) const { - assert(BaseT::mTree); + OPENVDB_ASSERT(BaseT::mTree); // Mutex lock the accessor. 
Does nothing if no mutex if in place [[maybe_unused]] const auto lock = this->lock(); using RetT = typename std::invoke_result::type; diff --git a/openvdb/openvdb/unittest/CMakeLists.txt b/openvdb/openvdb/unittest/CMakeLists.txt index d9b011f4f3..3fa4b9130b 100644 --- a/openvdb/openvdb/unittest/CMakeLists.txt +++ b/openvdb/openvdb/unittest/CMakeLists.txt @@ -221,15 +221,16 @@ endif() target_link_libraries(vdb_test ${OPENVDB_TEST_DEPENDENT_LIBS}) add_test(NAME vdb_unit_test COMMAND $ -v) -# For the undefined behaviour sanitizer, add the suppression file and -# additional options - +# For the sanitizers, add the suppression files and additional options get_filename_component(PATH_TO_PROJECT_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) get_filename_component(PATH_TO_PROJECT_ROOT ${PATH_TO_PROJECT_ROOT} DIRECTORY) get_filename_component(PATH_TO_PROJECT_ROOT ${PATH_TO_PROJECT_ROOT} DIRECTORY) +set(LSAN_SUPRESSION_FILE ${PATH_TO_PROJECT_ROOT}/cmake/scripts/lsan.supp) set(UBSAN_SUPRESSION_FILE ${PATH_TO_PROJECT_ROOT}/cmake/scripts/ubsan.supp) -set_tests_properties(vdb_unit_test PROPERTIES - ENVIRONMENT - "$<$:UBSAN_OPTIONS=halt_on_error=1 report_error_type=1 suppressions=${UBSAN_SUPRESSION_FILE}>") +set(UBSAN_OPTS "$<$:UBSAN_OPTIONS=halt_on_error=1 report_error_type=1 suppressions=${UBSAN_SUPRESSION_FILE}>") +set(LSAN_OPTS "$<$:LSAN_OPTIONS=suppressions=${LSAN_SUPRESSION_FILE}>") +set(ASAN_OPTS "$<$:LSAN_OPTIONS=suppressions=${LSAN_SUPRESSION_FILE}>") +set_tests_properties(vdb_unit_test PROPERTIES + ENVIRONMENT "$") diff --git a/openvdb/openvdb/unittest/TestAttributeArray.cc b/openvdb/openvdb/unittest/TestAttributeArray.cc index 30ec17aab5..65fabbec0d 100644 --- a/openvdb/openvdb/unittest/TestAttributeArray.cc +++ b/openvdb/openvdb/unittest/TestAttributeArray.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -892,8 +893,8 @@ struct VectorWrapper VectorWrapper(const T& _data) : data(_data) { } operator bool() const { return index < data.size(); } VectorWrapper& operator++() { index++; return *this; } - Index sourceIndex() const { assert(*this); return data[index].first; } - Index targetIndex() const { assert(*this); return data[index].second; } + Index sourceIndex() const { OPENVDB_ASSERT(*this); return data[index].first; } + Index targetIndex() const { OPENVDB_ASSERT(*this); return data[index].second; } private: const T& data; diff --git a/openvdb/openvdb/unittest/TestLinearInterp.cc b/openvdb/openvdb/unittest/TestLinearInterp.cc index 944f0ef60b..137c81f7de 100644 --- a/openvdb/openvdb/unittest/TestLinearInterp.cc +++ b/openvdb/openvdb/unittest/TestLinearInterp.cc @@ -999,7 +999,7 @@ template void TestLinearInterp::testStencilsMatch() { - typedef typename GridType::ValueType ValueType; + using ValueType = typename GridType::ValueType; GridType grid; typename GridType::TreeType& tree = grid.tree(); @@ -1022,14 +1022,13 @@ TestLinearInterp::testStencilsMatch() openvdb::tools::GridSampler interpolator(grid); - openvdb::math::BoxStencil - stencil(grid); - - typename GridType::ValueType val1 = interpolator.sampleVoxel(pos.x(), pos.y(), pos.z()); + openvdb::math::BoxStencil stencil(grid); + const ValueType val1 = interpolator.sampleVoxel(pos.x(), pos.y(), pos.z()); stencil.moveTo(pos); - typename GridType::ValueType val2 = stencil.interpolation(pos); - EXPECT_EQ(val1, val2); + const ValueType val2 = stencil.interpolation(pos); + static const ValueType epsilon = openvdb::math::Delta::value(); + EXPECT_NEAR(val1, val2, epsilon); } } TEST_F(TestLinearInterp, testStencilsMatchFloat) { 
testStencilsMatch(); } diff --git a/openvdb/openvdb/unittest/TestPointRasterizeTrilinear.cc b/openvdb/openvdb/unittest/TestPointRasterizeTrilinear.cc index cc23dbcdb7..6346e98f9e 100644 --- a/openvdb/openvdb/unittest/TestPointRasterizeTrilinear.cc +++ b/openvdb/openvdb/unittest/TestPointRasterizeTrilinear.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -244,7 +245,7 @@ TEST_F(TestPointRasterize, tetsSingleTreeRasterize) for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) { const Index j = ((c.y() & (DIM-1u)) << LOG2DIM); for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) { - assert(bounds.isInside(c)); + OPENVDB_ASSERT(bounds.isInside(c)); const Index offset = i + j + /*k*/(c.z() & (DIM-1u)); if (!mask.isOn(offset)) continue; data[offset] += 1; @@ -340,7 +341,7 @@ TEST_F(TestPointRasterize, testMultiTreeRasterize) for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) { const Index j = ((c.y() & (DIM-1u)) << LOG2DIM); for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) { - assert(bounds.isInside(c)); + OPENVDB_ASSERT(bounds.isInside(c)); const Index offset = i + j + /*k*/(c.z() & (DIM-1u)); if (!mask.isOn(offset)) continue; data1[offset] += static_cast(vec.length()); diff --git a/openvdb/openvdb/unittest/TestStreamCompression.cc b/openvdb/openvdb/unittest/TestStreamCompression.cc index eb4c760a9c..a7088d9dee 100644 --- a/openvdb/openvdb/unittest/TestStreamCompression.cc +++ b/openvdb/openvdb/unittest/TestStreamCompression.cc @@ -20,15 +20,9 @@ #include #include #include -#include -#include -#include // for BOOST_VERSION #ifdef _WIN32 #include // open_existing_file(), close_file() -// boost::interprocess::detail was renamed to boost::interprocess::ipcdetail in Boost 1.48. -// Ensure that both namespaces exist. -namespace boost { namespace interprocess { namespace detail {} namespace ipcdetail {} } } #include #else #include // for struct stat @@ -43,16 +37,6 @@ namespace boost { namespace interprocess { namespace detail {} namespace ipcdeta #ifdef OPENVDB_USE_BLOSC #include -// A Blosc optimization introduced in 1.11.0 uses a slightly smaller block size for -// HCR codecs (LZ4, ZLIB, ZSTD), which otherwise fails a few regression test cases -#if BLOSC_VERSION_MAJOR > 1 || (BLOSC_VERSION_MAJOR == 1 && BLOSC_VERSION_MINOR > 10) -#define BLOSC_HCR_BLOCKSIZE_OPTIMIZATION -#endif -// Blosc 1.14+ writes backwards-compatible data by default. 
-// http://blosc.org/posts/new-forward-compat-policy/ -#if BLOSC_VERSION_MAJOR > 1 || (BLOSC_VERSION_MAJOR == 1 && BLOSC_VERSION_MINOR >= 14) -#define BLOSC_BACKWARDS_COMPATIBLE -#endif #endif #ifdef OPENVDB_USE_DELAYED_LOADING @@ -74,7 +58,6 @@ class ProxyMappedFile mLastWriteTime = 0; const char* regionFilename = mMap.get_name(); #ifdef _WIN32 - using namespace boost::interprocess::detail; using namespace boost::interprocess::ipcdetail; using openvdb::Index64; diff --git a/openvdb/openvdb/unittest/TestVolumeRayIntersector.cc b/openvdb/openvdb/unittest/TestVolumeRayIntersector.cc index 89613c9658..dc061b9488 100644 --- a/openvdb/openvdb/unittest/TestVolumeRayIntersector.cc +++ b/openvdb/openvdb/unittest/TestVolumeRayIntersector.cc @@ -13,7 +13,6 @@ #include -#include #include #include #include diff --git a/openvdb/openvdb/util/Assert.cc b/openvdb/openvdb/util/Assert.cc new file mode 100644 index 0000000000..540293b2b3 --- /dev/null +++ b/openvdb/openvdb/util/Assert.cc @@ -0,0 +1,34 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 +/// +/// @file Assert.cc + +#include "Assert.h" + +#include <cstdio> +#include <cstdlib> + +namespace openvdb { +OPENVDB_USE_VERSION_NAMESPACE +namespace OPENVDB_VERSION_NAME { + +[[noreturn]] void assertAbort( + const char *assertion, + const char *file, + const unsigned line, + const char *function, + const char* msg) +{ + std::fprintf(stderr, "%s:%u:", file, line); + std::fprintf(stderr, " Assertion failed: "); + std::fprintf(stderr, "'%s'", assertion); + std::fprintf(stderr, " in function: "); + std::fprintf(stderr, "'%s'", function); + if (msg) std::fprintf(stderr, "\n%s", msg); + std::fprintf(stderr, "\n"); + // @todo could make this optional with another compile define + std::abort(); +} + +} +} diff --git a/openvdb/openvdb/util/Assert.h b/openvdb/openvdb/util/Assert.h new file mode 100644 index 0000000000..aa56f911c2 --- /dev/null +++ b/openvdb/openvdb/util/Assert.h @@ -0,0 +1,45 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 +/// +/// @file Assert.h + +#ifndef OPENVDB_UTIL_ASSERT_HAS_BEEN_INCLUDED +#define OPENVDB_UTIL_ASSERT_HAS_BEEN_INCLUDED + +#include <openvdb/version.h> +#include <openvdb/Platform.h> + +namespace openvdb { +OPENVDB_USE_VERSION_NAMESPACE +namespace OPENVDB_VERSION_NAME { + +/// @brief Trigger a SIGABRT after printing a formatted assertion message. +/// Effectively performs the same functionality as cassert, but allows +/// VDB code to call this independently of the NDEBUG define. +/// @param assertion The variable or expression that triggered the assertion +/// as a string +/// @param file The name of the file in which the assertion occurred +/// @param line The line in the file at which the assertion occurred +/// @param function The name of the function the assertion occurred in +/// @param msg An optional descriptive message +[[noreturn]] void assertAbort( + const char *assertion, + const char *file, + const unsigned line, + const char *function, + const char* msg = nullptr); + +} +} + +#ifdef OPENVDB_ENABLE_ASSERTS +#define OPENVDB_ASSERT(X) \ + (OPENVDB_LIKELY(X) ? (void)0 : openvdb::assertAbort(#X, __FILE__, __LINE__, __PRETTY_FUNCTION__)) +#define OPENVDB_ASSERT_MESSAGE(X, MSG) \ + (OPENVDB_LIKELY(X) ?
(void)0 : openvdb::assertAbort(#X, __FILE__, __LINE__, __PRETTY_FUNCTION__, MSG)) +#else +#define OPENVDB_ASSERT(X) (void)0; +#define OPENVDB_ASSERT_MESSAGE(X, MSG) (void)0; +#endif // OPENVDB_ENABLE_ASSERTS + +#endif // OPENVDB_UTIL_ASSERT_HAS_BEEN_INCLUDED diff --git a/openvdb/openvdb/util/NodeMasks.h b/openvdb/openvdb/util/NodeMasks.h index accffe4f1e..4d8c212d59 100644 --- a/openvdb/openvdb/util/NodeMasks.h +++ b/openvdb/openvdb/util/NodeMasks.h @@ -9,11 +9,11 @@ #define OPENVDB_UTIL_NODEMASKS_HAS_BEEN_INCLUDED #include // for std::min() -#include #include #include // for cout #include #include +#include //#include // for ffs @@ -84,7 +84,7 @@ inline Index32 CountOff(Index64 v) { return CountOn(~v); } inline Index32 FindLowestOn(Byte v) { - assert(v); + OPENVDB_ASSERT(v); #if defined(OPENVDB_USE_SSE42) && defined(_MSC_VER) unsigned long index; _BitScanForward(&index, static_cast(v)); @@ -102,7 +102,7 @@ FindLowestOn(Byte v) inline Index32 FindLowestOn(Index32 v) { - assert(v); + OPENVDB_ASSERT(v); //return ffs(v); static const Byte DeBruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -124,7 +124,7 @@ FindLowestOn(Index32 v) inline Index32 FindLowestOn(Index64 v) { - assert(v); + OPENVDB_ASSERT(v); #if defined(OPENVDB_USE_SSE42) && defined(_MSC_VER) unsigned long index; _BitScanForward64(&index, v); @@ -187,7 +187,7 @@ class BaseMaskIterator BaseMaskIterator(const BaseMaskIterator&) = default; BaseMaskIterator(Index32 pos, const NodeMask* parent): mPos(pos), mParent(parent) { - assert((parent == nullptr && pos == 0) || (parent != nullptr && pos <= NodeMask::SIZE)); + OPENVDB_ASSERT((parent == nullptr && pos == 0) || (parent != nullptr && pos <= NodeMask::SIZE)); } bool operator==(const BaseMaskIterator &iter) const {return mPos == iter.mPos;} bool operator!=(const BaseMaskIterator &iter) const {return mPos != iter.mPos;} @@ -198,7 +198,7 @@ class BaseMaskIterator } Index32 offset() const { return mPos; } Index32 pos() const { return mPos; } - bool test() const { assert(mPos <= NodeMask::SIZE); return (mPos != NodeMask::SIZE); } + bool test() const { OPENVDB_ASSERT(mPos <= NodeMask::SIZE); return (mPos != NodeMask::SIZE); } operator bool() const { return this->test(); } }; // class BaseMaskIterator @@ -216,9 +216,9 @@ class OnMaskIterator: public BaseMaskIterator OnMaskIterator(Index32 pos,const NodeMask *parent) : BaseType(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos = mParent->findNextOn(mPos+1); - assert(mPos <= NodeMask::SIZE); + OPENVDB_ASSERT(mPos <= NodeMask::SIZE); } void increment(Index n) { while(n-- && this->next()) ; } bool next() @@ -247,9 +247,9 @@ class OffMaskIterator: public BaseMaskIterator OffMaskIterator(Index32 pos,const NodeMask *parent) : BaseType(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos=mParent->findNextOff(mPos+1); - assert(mPos <= NodeMask::SIZE); + OPENVDB_ASSERT(mPos <= NodeMask::SIZE); } void increment(Index n) { while(n-- && this->next()) ; } bool next() @@ -279,9 +279,9 @@ class DenseMaskIterator: public BaseMaskIterator DenseMaskIterator(Index32 pos,const NodeMask *parent) : BaseType(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos += 1;//careful - the increment might go beyond the end - assert(mPos<= NodeMask::SIZE); + OPENVDB_ASSERT(mPos<= NodeMask::SIZE); } void increment(Index n) { while(n-- && this->next()) ; } bool next() @@ -450,12 +450,12 @@ 
class NodeMask Index32 countOff() const { return SIZE-this->countOn(); } /// Set the nth bit on void setOn(Index32 n) { - assert( (n >> 6) < WORD_COUNT ); + OPENVDB_ASSERT( (n >> 6) < WORD_COUNT ); mWords[n >> 6] |= Word(1) << (n & 63); } /// Set the nth bit off void setOff(Index32 n) { - assert( (n >> 6) < WORD_COUNT ); + OPENVDB_ASSERT( (n >> 6) < WORD_COUNT ); mWords[n >> 6] &= ~(Word(1) << (n & 63)); } /// Set the nth bit to the specified state @@ -481,7 +481,7 @@ class NodeMask } /// Toggle the state of the nth bit void toggle(Index32 n) { - assert( (n >> 6) < WORD_COUNT ); + OPENVDB_ASSERT( (n >> 6) < WORD_COUNT ); mWords[n >> 6] ^= Word(1) << (n & 63); } /// Toggle the state of all bits in the mask @@ -501,7 +501,7 @@ class NodeMask /// Return @c true if the nth bit is on bool isOn(Index32 n) const { - assert( (n >> 6) < WORD_COUNT ); + OPENVDB_ASSERT( (n >> 6) < WORD_COUNT ); return 0 != (mWords[n >> 6] & (Word(1) << (n & 63))); } /// Return @c true if the nth bit is off @@ -551,13 +551,13 @@ class NodeMask template WordT getWord(Index n) const { - assert(n*8*sizeof(WordT) < SIZE); + OPENVDB_ASSERT(n*8*sizeof(WordT) < SIZE); return reinterpret_cast(mWords)[n]; } template WordT& getWord(Index n) { - assert(n*8*sizeof(WordT) < SIZE); + OPENVDB_ASSERT(n*8*sizeof(WordT) < SIZE); return reinterpret_cast(mWords)[n]; } //@} @@ -726,12 +726,12 @@ class NodeMask<1> Index32 countOff() const { return CountOff(mByte); } /// Set the nth bit on void setOn(Index32 n) { - assert( n < 8 ); + OPENVDB_ASSERT( n < 8 ); mByte = static_cast(mByte | 0x01U << (n & 7)); } /// Set the nth bit off void setOff(Index32 n) { - assert( n < 8 ); + OPENVDB_ASSERT( n < 8 ); mByte = static_cast(mByte & ~(0x01U << (n & 7))); } /// Set the nth bit to the specified state @@ -744,7 +744,7 @@ class NodeMask<1> void setOff() { mByte = 0x00U; } /// Toggle the state of the nth bit void toggle(Index32 n) { - assert( n < 8 ); + OPENVDB_ASSERT( n < 8 ); mByte = static_cast(mByte ^ 0x01U << (n & 7)); } /// Toggle the state of all bits in the mask @@ -760,7 +760,7 @@ class NodeMask<1> /// Return true if the nth bit is on bool isOn(Index32 n) const { - assert( n < 8 ); + OPENVDB_ASSERT( n < 8 ); return mByte & (0x01U << (n & 7)); } /// Return true if the nth bit is off @@ -791,14 +791,14 @@ class NodeMask<1> WordT getWord(Index n) const { static_assert(sizeof(WordT) == sizeof(Byte), "expected word size to be one byte"); - assert(n == 0); + OPENVDB_ASSERT(n == 0); return reinterpret_cast(mByte); } template WordT& getWord(Index n) { static_assert(sizeof(WordT) == sizeof(Byte), "expected word size to be one byte"); - assert(n == 0); + OPENVDB_ASSERT(n == 0); return reinterpret_cast(mByte); } //@} @@ -948,12 +948,12 @@ class NodeMask<2> Index32 countOff() const { return CountOff(mWord); } /// Set the nth bit on void setOn(Index32 n) { - assert( n < 64 ); + OPENVDB_ASSERT( n < 64 ); mWord |= UINT64_C(0x01) << (n & 63); } /// Set the nth bit off void setOff(Index32 n) { - assert( n < 64 ); + OPENVDB_ASSERT( n < 64 ); mWord &= ~(UINT64_C(0x01) << (n & 63)); } /// Set the nth bit to the specified state @@ -966,7 +966,7 @@ class NodeMask<2> void setOff() { mWord = UINT64_C(0x00); } /// Toggle the state of the nth bit void toggle(Index32 n) { - assert( n < 64 ); + OPENVDB_ASSERT( n < 64 ); mWord ^= UINT64_C(0x01) << (n & 63); } /// Toggle the state of all bits in the mask @@ -982,7 +982,7 @@ class NodeMask<2> /// Return true if the nth bit is on bool isOn(Index32 n) const { - assert( n < 64 ); + OPENVDB_ASSERT( n < 64 ); return 0 != (mWord 
& (UINT64_C(0x01) << (n & 63))); } /// Return true if the nth bit is off @@ -1009,13 +1009,13 @@ class NodeMask<2> template WordT getWord(Index n) const { - assert(n*8*sizeof(WordT) < SIZE); + OPENVDB_ASSERT(n*8*sizeof(WordT) < SIZE); return reinterpret_cast(&mWord)[n]; } template WordT& getWord(Index n) { - assert(n*8*sizeof(WordT) < SIZE); + OPENVDB_ASSERT(n*8*sizeof(WordT) < SIZE); return reinterpret_cast(mWord)[n]; } //@} @@ -1116,7 +1116,7 @@ class RootNodeMask BaseIterator() : mPos(0), mBitSize(0), mParent(nullptr) {} BaseIterator(const BaseIterator&) = default; BaseIterator(Index32 pos, const RootNodeMask* parent): - mPos(pos), mBitSize(parent->getBitSize()), mParent(parent) { assert(pos <= mBitSize); } + mPos(pos), mBitSize(parent->getBitSize()), mParent(parent) { OPENVDB_ASSERT(pos <= mBitSize); } bool operator==(const BaseIterator &iter) const {return mPos == iter.mPos;} bool operator!=(const BaseIterator &iter) const {return mPos != iter.mPos;} bool operator< (const BaseIterator &iter) const {return mPos < iter.mPos;} @@ -1132,7 +1132,7 @@ class RootNodeMask Index32 pos() const {return mPos;} bool test() const { - assert(mPos <= mBitSize); + OPENVDB_ASSERT(mPos <= mBitSize); return (mPos != mBitSize); } @@ -1150,9 +1150,9 @@ class RootNodeMask OnIterator() : BaseIterator() {} OnIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos=mParent->findNextOn(mPos+1); - assert(mPos <= mBitSize); + OPENVDB_ASSERT(mPos <= mBitSize); } void increment(Index n) { for (Index i=0; inext(); ++i) {} @@ -1178,9 +1178,9 @@ class RootNodeMask OffIterator() : BaseIterator() {} OffIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos=mParent->findNextOff(mPos+1); - assert(mPos <= mBitSize); + OPENVDB_ASSERT(mPos <= mBitSize); } void increment(Index n) { for (Index i=0; inext(); ++i) {} @@ -1206,9 +1206,9 @@ class RootNodeMask DenseIterator() : BaseIterator() {} DenseIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {} void increment() { - assert(mParent != nullptr); + OPENVDB_ASSERT(mParent != nullptr); mPos += 1;//carefull - the increament might go beyond the end - assert(mPos<= mBitSize); + OPENVDB_ASSERT(mPos<= mBitSize); } void increment(Index n) { for (Index i=0; inext(); ++i) {} @@ -1248,7 +1248,7 @@ class RootNodeMask // RootNodeMask operator!() const { RootNodeMask m = *this; m.toggle(); return m; } const RootNodeMask& operator&=(const RootNodeMask& other) { - assert(mIntSize == other.mIntSize); + OPENVDB_ASSERT(mIntSize == other.mIntSize); for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) { mBits[i] &= other.mBits[i]; } @@ -1256,14 +1256,14 @@ class RootNodeMask return *this; } const RootNodeMask& operator|=(const RootNodeMask& other) { - assert(mIntSize == other.mIntSize); + OPENVDB_ASSERT(mIntSize == other.mIntSize); for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) { mBits[i] |= other.mBits[i]; } return *this; } const RootNodeMask& operator^=(const RootNodeMask& other) { - assert(mIntSize == other.mIntSize); + OPENVDB_ASSERT(mIntSize == other.mIntSize); for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) { mBits[i] ^= other.mBits[i]; } @@ -1285,7 +1285,7 @@ class RootNodeMask } Index32 countOn() const { - assert(mBits); + OPENVDB_ASSERT(mBits); Index32 n=0; for (Index32 
i=0; i< mIntSize; ++i) n += CountOn(mBits[i]); return n; @@ -1294,34 +1294,34 @@ class RootNodeMask Index32 countOff() const { return mBitSize-this->countOn(); } void setOn(Index32 i) { - assert(mBits); - assert( (i>>5) < mIntSize); + OPENVDB_ASSERT(mBits); + OPENVDB_ASSERT( (i>>5) < mIntSize); mBits[i>>5] |= 1<<(i&31); } void setOff(Index32 i) { - assert(mBits); - assert( (i>>5) < mIntSize); + OPENVDB_ASSERT(mBits); + OPENVDB_ASSERT( (i>>5) < mIntSize); mBits[i>>5] &= ~(1<<(i&31)); } void set(Index32 i, bool On) { On ? this->setOn(i) : this->setOff(i); } void setOn() { - assert(mBits); + OPENVDB_ASSERT(mBits); for (Index32 i=0; i>5) < mIntSize); + OPENVDB_ASSERT(mBits); + OPENVDB_ASSERT( (i>>5) < mIntSize); mBits[i>>5] ^= 1<<(i&31); } void toggle() { - assert(mBits); + OPENVDB_ASSERT(mBits); for (Index32 i=0; isetOn(0); } @@ -1329,13 +1329,13 @@ class RootNodeMask void setFirstOff() { this->setOff(0); } void setLastOff() { this->setOff(mBitSize-1); } bool isOn(Index32 i) const { - assert(mBits); - assert( (i>>5) < mIntSize); + OPENVDB_ASSERT(mBits); + OPENVDB_ASSERT( (i>>5) < mIntSize); return ( mBits[i >> 5] & (1<<(i&31)) ); } bool isOff(Index32 i) const { - assert(mBits); - assert( (i>>5) < mIntSize); + OPENVDB_ASSERT(mBits); + OPENVDB_ASSERT( (i>>5) < mIntSize); return ( ~mBits[i >> 5] & (1<<(i&31)) ); } @@ -1352,29 +1352,29 @@ class RootNodeMask } Index32 findFirstOn() const { - assert(mBits); + OPENVDB_ASSERT(mBits); Index32 i=0; while(!mBits[i]) if (++i == mIntSize) return mBitSize;//reached end return 32*i + FindLowestOn(mBits[i]); } Index32 findFirstOff() const { - assert(mBits); + OPENVDB_ASSERT(mBits); Index32 i=0; while(!(~mBits[i])) if (++i == mIntSize) return mBitSize;//reached end return 32*i + FindLowestOn(~mBits[i]); } void save(std::ostream& os) const { - assert(mBits); + OPENVDB_ASSERT(mBits); os.write(reinterpret_cast(mBits), mIntSize * sizeof(Index32)); } void load(std::istream& is) { - assert(mBits); + OPENVDB_ASSERT(mBits); is.read(reinterpret_cast(mBits), mIntSize * sizeof(Index32)); } void seek(std::istream& is) const { - assert(mBits); + OPENVDB_ASSERT(mBits); is.seekg(mIntSize * sizeof(Index32), std::ios_base::cur); } /// @brief simple print method for debugging @@ -1400,7 +1400,7 @@ class RootNodeMask } Index32 findNextOn(Index32 start) const { - assert(mBits); + OPENVDB_ASSERT(mBits); Index32 n = start >> 5, m = start & 31;//initiate if (n>=mIntSize) return mBitSize; // check for out of bounds Index32 b = mBits[n]; @@ -1411,7 +1411,7 @@ class RootNodeMask } Index32 findNextOff(Index32 start) const { - assert(mBits); + OPENVDB_ASSERT(mBits); Index32 n = start >> 5, m = start & 31;//initiate if (n>=mIntSize) return mBitSize; // check for out of bounds Index32 b = ~mBits[n]; @@ -1422,7 +1422,7 @@ class RootNodeMask } Index32 memUsage() const { - assert(mBits); + OPENVDB_ASSERT(mBits); return static_cast(sizeof(Index32*)+(2+mIntSize)*sizeof(Index32));//in bytes } }; // class RootNodeMask diff --git a/openvdb/openvdb/util/PagedArray.h b/openvdb/openvdb/util/PagedArray.h index cfacd1f994..f98e621f40 100644 --- a/openvdb/openvdb/util/PagedArray.h +++ b/openvdb/openvdb/util/PagedArray.h @@ -17,8 +17,8 @@ #include #include // SharedPtr +#include #include -#include #include #include #include // std::swap @@ -215,7 +215,7 @@ class PagedArray /// @warning It is assumed that the i'th element is already allocated! 
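The PagedArray accessors that follow resolve an element index in two steps: the high bits (i >> Log2PageSize) select a page and the low bits select the slot within that page, which is why the asserts only need to bound-check i itself. A minimal sketch of the addressing, assuming a fixed page size of 2^10 elements (the class takes it as a template parameter):

    #include <cstddef>
    #include <vector>

    constexpr std::size_t Log2PageSize = 10;
    constexpr std::size_t PageSize = std::size_t(1) << Log2PageSize;  // 1024 elements

    template<typename T>
    T& pagedAt(std::vector<T*>& pageTable, std::size_t i)
    {
        T* page = pageTable[i >> Log2PageSize];  // high bits: which page
        return page[i & (PageSize - 1)];         // low bits: offset within the page
    }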
ValueType& operator[](size_t i) { - assert(i<mSize); + OPENVDB_ASSERT(i<mSize); return (*mPageTable[i>>Log2PageSize])[i]; } @@ -228,7 +228,7 @@ class PagedArray /// @warning It is assumed that the i'th element is already allocated! const ValueType& operator[](size_t i) const { - assert(i<mSize); + OPENVDB_ASSERT(i<mSize); return (*mPageTable[i>>Log2PageSize])[i]; } @@ -500,7 +500,7 @@ void PagedArray::merge(PagedArray& other) template void PagedArray::add_full(Page*& page, size_t size) { - assert(size == Page::Size);//page must be full + OPENVDB_ASSERT(size == Page::Size);//page must be full if (mSize & Page::Mask) {//page-table is partially full Page*& tmp = mPageTable.back(); std::swap(tmp, page);//swap last table entry with page @@ -514,7 +514,7 @@ void PagedArray::add_full(Page*& page, size_t size) template void PagedArray::add_partially_full(Page*& page, size_t size) { - assert(size > 0 && size < Page::Size);//page must be partially full + OPENVDB_ASSERT(size > 0 && size < Page::Size);//page must be partially full if (size_t m = mSize & Page::Mask) {//page table is also partially full ValueT *s = page->data(), *t = mPageTable.back()->data() + m; for (size_t i=std::min(mSize+size, mCapacity)-mSize; i; --i) *t++ = *s++; diff --git a/openvdb/openvdb/version.h.in b/openvdb/openvdb/version.h.in index 9a95b8d3a5..54a7b3742d 100644 --- a/openvdb/openvdb/version.h.in +++ b/openvdb/openvdb/version.h.in @@ -143,6 +143,11 @@ #cmakedefine OPENVDB_USE_DELAYED_LOADING #endif +/* Denotes whether VDB was built with asserts enabled in VDB code */ +#ifndef OPENVDB_ENABLE_ASSERTS +#cmakedefine OPENVDB_ENABLE_ASSERTS +#endif + /* Denotes whether VDB was built with explicit template instantiation */ #ifndef OPENVDB_USE_EXPLICIT_INSTANTIATION #cmakedefine OPENVDB_USE_EXPLICIT_INSTANTIATION @@ -160,6 +165,7 @@ #define OPENVDB_VOLUME_TREE_INSTANTIATE(Function) @OPENVDB_VOLUME_TREE_INSTANTIATIONS@ #define OPENVDB_ALL_TREE_INSTANTIATE(Function) @OPENVDB_ALL_TREE_INSTANTIATIONS@ + /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// diff --git a/openvdb_ax/openvdb_ax/CMakeLists.txt b/openvdb_ax/openvdb_ax/CMakeLists.txt index 61af2d8ede..f2e718683b 100644 --- a/openvdb_ax/openvdb_ax/CMakeLists.txt +++ b/openvdb_ax/openvdb_ax/CMakeLists.txt @@ -306,6 +306,10 @@ if(OPENVDB_AX_STATIC) PROPERTIES OUTPUT_NAME openvdb_ax ) + if(WIN32) + set_target_properties(openvdb_ax_static PROPERTIES PREFIX "lib") + endif() + target_link_libraries(openvdb_ax_static PUBLIC ${OPENVDB_AX_CORE_DEPENDENT_LIBS}) # @todo fix opaque pointer requirements diff --git a/openvdb_ax/openvdb_ax/ast/AST.h b/openvdb_ax/openvdb_ax/ast/AST.h index 9ccbbe72a1..c27adba5d9 100644 --- a/openvdb_ax/openvdb_ax/ast/AST.h +++ b/openvdb_ax/openvdb_ax/ast/AST.h @@ -30,6 +30,7 @@ #include "Tokens.h" #include +#include #include #include @@ -278,7 +279,7 @@ struct Node bool hasChild = false; for (size_t i = 0; i < parent->children(); ++i) hasChild |= parent->child(i) == this; - assert(hasChild); + OPENVDB_ASSERT(hasChild); #endif mParent = parent; } @@ -729,16 +730,16 @@ struct Loop : public Statement , mBody(body) , mInitial(init) , mIteration(iter) { - assert(mConditional); - assert(mBody); + OPENVDB_ASSERT(mConditional); + OPENVDB_ASSERT(mBody); mConditional->setParent(this); mBody->setParent(this); if (mInitial) { - assert(mLoopType == tokens::LoopToken::FOR); + OPENVDB_ASSERT(mLoopType == tokens::LoopToken::FOR); mInitial->setParent(this); } if (mIteration) { - assert(mLoopType == tokens::LoopToken::FOR); + OPENVDB_ASSERT(mLoopType == tokens::LoopToken::FOR);
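The Loop constructor hunks above encode an AST invariant rather than a bounds check: only for-loops may carry initial/iteration expressions, so the asserts fire if a while or do-while node is built with them. A minimal sketch of the rule with a hypothetical node type:

    #include <openvdb/util/Assert.h>

    enum class LoopToken { FOR, WHILE, DO };

    struct LoopNode {
        LoopToken type;
        const void* init = nullptr;  // stand-ins for the child expression nodes
        const void* iter = nullptr;
        void validate() const {
            // while/do-while loops must not carry for-loop clauses
            OPENVDB_ASSERT(!init || type == LoopToken::FOR);
            OPENVDB_ASSERT(!iter || type == LoopToken::FOR);
        }
    };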
mIteration->setParent(this); } } @@ -755,11 +756,11 @@ struct Loop : public Statement mConditional->setParent(this); mBody->setParent(this); if (mInitial) { - assert(mLoopType == tokens::LoopToken::FOR); + OPENVDB_ASSERT(mLoopType == tokens::LoopToken::FOR); mInitial->setParent(this); } if (mIteration) { - assert(mLoopType == tokens::LoopToken::FOR); + OPENVDB_ASSERT(mLoopType == tokens::LoopToken::FOR); mIteration->setParent(this); } } @@ -880,8 +881,8 @@ struct ConditionalStatement : public Statement : mConditional(conditional) , mTrueBranch(trueBlock) , mFalseBranch(falseBlock) { - assert(mConditional); - assert(mTrueBranch); + OPENVDB_ASSERT(mConditional); + OPENVDB_ASSERT(mTrueBranch); mConditional->setParent(this); mTrueBranch->setParent(this); if (mFalseBranch) mFalseBranch->setParent(this); @@ -1002,8 +1003,8 @@ struct BinaryOperator : public Expression : mLeft(left) , mRight(right) , mOperation(op) { - assert(mLeft); - assert(mRight); + OPENVDB_ASSERT(mLeft); + OPENVDB_ASSERT(mRight); mLeft->setParent(this); mRight->setParent(this); } @@ -1107,8 +1108,8 @@ struct TernaryOperator : public Expression : mConditional(conditional) , mTrueBranch(trueExpression) , mFalseBranch(falseExpression) { - assert(mConditional); - assert(mFalseBranch); + OPENVDB_ASSERT(mConditional); + OPENVDB_ASSERT(mFalseBranch); mConditional->setParent(this); if (mTrueBranch) mTrueBranch->setParent(this); mFalseBranch->setParent(this); @@ -1209,8 +1210,8 @@ struct AssignExpression : public Expression : mLHS(lhs) , mRHS(rhs) , mOperation(op) { - assert(mLHS); - assert(mRHS); + OPENVDB_ASSERT(mLHS); + OPENVDB_ASSERT(mRHS); mLHS->setParent(this); mRHS->setParent(this); } @@ -1397,7 +1398,7 @@ struct UnaryOperator : public Expression UnaryOperator(Expression* expr, const tokens::OperatorToken op) : mExpression(expr) , mOperation(op) { - assert(mExpression); + OPENVDB_ASSERT(mExpression); mExpression->setParent(this); } /// @brief Construct a new UnaryOperator with a string, delegating @@ -1473,7 +1474,7 @@ struct Cast : public Expression : Expression() , mType(type) , mExpression(expr) { - assert(mExpression); + OPENVDB_ASSERT(mExpression); mExpression->setParent(this); } /// @brief Deep copy constructor for a Cast node, performing a deep copy on @@ -1701,8 +1702,8 @@ struct ArrayUnpack : public Expression : mIdx0(component0) , mIdx1(component1) , mExpression(expr) { - assert(mIdx0); - assert(mExpression); + OPENVDB_ASSERT(mIdx0); + OPENVDB_ASSERT(mExpression); mIdx0->setParent(this); if(mIdx1) mIdx1->setParent(this); mExpression->setParent(this); @@ -2147,7 +2148,7 @@ struct DeclareLocal : public Statement : mType(type) , mLocal(local) , mInit(init) { - assert(mLocal); + OPENVDB_ASSERT(mLocal); mLocal->setParent(this); if (mInit) mInit->setParent(this); } diff --git a/openvdb_ax/openvdb_ax/ast/Parse.cc b/openvdb_ax/openvdb_ax/ast/Parse.cc index 4230b6e106..23ad6c27ff 100644 --- a/openvdb_ax/openvdb_ax/ast/Parse.cc +++ b/openvdb_ax/openvdb_ax/ast/Parse.cc @@ -14,6 +14,8 @@ #include "../grammar/generated/axparser.h" #endif +#include + #include #include #include @@ -33,7 +35,7 @@ extern YY_BUFFER_STATE ax_scan_string(const char * str); extern void ax_delete_buffer(YY_BUFFER_STATE buffer); extern void axerror (openvdb::ax::ast::Tree**, char const *s) { //@todo: add check for memory exhaustion - assert(axlog); + OPENVDB_ASSERT(axlog); axlog->error(/*starts with 'syntax error, '*/s + 14, {axlloc.first_line, axlloc.first_column}); } diff --git a/openvdb_ax/openvdb_ax/ast/PrintTree.cc 
b/openvdb_ax/openvdb_ax/ast/PrintTree.cc index 17b7c62683..27a65dcac6 100644 --- a/openvdb_ax/openvdb_ax/ast/PrintTree.cc +++ b/openvdb_ax/openvdb_ax/ast/PrintTree.cc @@ -8,6 +8,8 @@ #include "Tokens.h" #include "Visitor.h" +#include + #include namespace openvdb { @@ -578,7 +580,7 @@ struct ReprintVisitor : public ast::Visitor for (size_t i = 1; i < children; ++i) { // all child statements should be declare locals - assert(stmtl->child(i)->nodetype() == + OPENVDB_ASSERT(stmtl->child(i)->nodetype() == ast::Node::DeclareLocalNode); mOs << ", "; diff --git a/openvdb_ax/openvdb_ax/ast/Scanners.cc b/openvdb_ax/openvdb_ax/ast/Scanners.cc index f0d1e73d54..a99b729743 100644 --- a/openvdb_ax/openvdb_ax/ast/Scanners.cc +++ b/openvdb_ax/openvdb_ax/ast/Scanners.cc @@ -6,6 +6,8 @@ #include "Scanners.h" #include "Visitor.h" +#include + #include #include @@ -270,7 +272,7 @@ bool usesAttribute(const ast::Node& node, bool found = false; visitNodeType(node, [&](const ast::Attribute& attrib) -> bool { - assert(!found); + OPENVDB_ASSERT(!found); if (type != tokens::UNKNOWN) { if (attrib.type() != type) return true; } @@ -291,7 +293,7 @@ bool writesToAttribute(const ast::Node& node, // See if any attributes in the result vec match the given name/type for (const ast::Variable* var : vars) { - assert(var->isType()); + OPENVDB_ASSERT(var->isType()); const ast::Attribute* attrib = static_cast(var); if (type != tokens::UNKNOWN) { if (attrib->type() != type) continue; @@ -370,7 +372,7 @@ void catalogueVariables(const ast::Node& node, parent = child->parent(); } - assert(read || write); + OPENVDB_ASSERT(read || write); if (readWrite && read && write) readWrite->emplace_back(var); if (readOnly && read && !write) readOnly->emplace_back(var); if (writeOnly && !read && write) writeOnly->emplace_back(var); @@ -405,7 +407,7 @@ void catalogueAttributeTokens(const ast::Node& node, const bool write) { for (const ast::Variable* var : vars) { - assert(var->isType()); + OPENVDB_ASSERT(var->isType()); const ast::Attribute* attrib = static_cast(var); auto& access = accessmap[attrib->tokenname()]; access.first |= read; @@ -468,8 +470,8 @@ struct UseVisitor : if (!this->traverse(loop->condition())) return false; if (!this->traverse(loop->body())) return false; } - assert(!loop->initial()); - assert(!loop->iteration()); + OPENVDB_ASSERT(!loop->initial()); + OPENVDB_ASSERT(!loop->iteration()); } else { if (!this->reverseChildVisits()) { @@ -518,7 +520,7 @@ void attributeDependencyTokens(const ast::Tree& tree, const std::string token = ast::Attribute::tokenFromNameType(name, type); const ast::Variable* var = lastUse(tree, token); if (!var) return; - assert(var->isType()); + OPENVDB_ASSERT(var->isType()); std::vector deps; variableDependencies(*var, deps); diff --git a/openvdb_ax/openvdb_ax/ax.cc b/openvdb_ax/openvdb_ax/ax.cc index f885cc789b..a4a6705a4f 100644 --- a/openvdb_ax/openvdb_ax/ax.cc +++ b/openvdb_ax/openvdb_ax/ax.cc @@ -7,6 +7,8 @@ #include "compiler/PointExecutable.h" #include "compiler/VolumeExecutable.h" +#include + #include #include #include // version numbers @@ -35,7 +37,7 @@ void run(const char* ax, openvdb::GridBase& grid, const AttributeBindings& bindi // the executable which can be used multiple times on any inputs const openvdb::ax::PointExecutable::Ptr exe = compiler.compile(ax); - assert(exe); + OPENVDB_ASSERT(exe); //Set the attribute bindings exe->setAttributeBindings(bindings); @@ -49,7 +51,7 @@ void run(const char* ax, openvdb::GridBase& grid, const AttributeBindings& bindi // the executable which can be 
used multiple times on any inputs const openvdb::ax::VolumeExecutable::Ptr exe = compiler.compile(ax); - assert(exe); + OPENVDB_ASSERT(exe); // Set the attribute bindings exe->setAttributeBindings(bindings); @@ -81,7 +83,7 @@ void run(const char* ax, openvdb::GridPtrVec& grids, const AttributeBindings& bi // the executable which can be used multiple times on any inputs const openvdb::ax::PointExecutable::Ptr exe = compiler.compile(ax); - assert(exe); + OPENVDB_ASSERT(exe); //Set the attribute bindings exe->setAttributeBindings(bindings); @@ -97,7 +99,7 @@ void run(const char* ax, openvdb::GridPtrVec& grids, const AttributeBindings& bi // the executable which can be used multiple times on any inputs const openvdb::ax::VolumeExecutable::Ptr exe = compiler.compile(ax); - assert(exe); + OPENVDB_ASSERT(exe); //Set the attribute bindings exe->setAttributeBindings(bindings); diff --git a/openvdb_ax/openvdb_ax/codegen/Codecs.cc b/openvdb_ax/openvdb_ax/codegen/Codecs.cc index e9a9d9490c..36f94c4b6c 100644 --- a/openvdb_ax/openvdb_ax/codegen/Codecs.cc +++ b/openvdb_ax/openvdb_ax/codegen/Codecs.cc @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 #include // for native codec types +#include #include "Codecs.h" @@ -33,7 +34,7 @@ inline FunctionGroup::UniquePtr axtrncdecode() [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::Value* out = args[0]; llvm::Value* in = args[1]; llvm::Type* type = in->getType()->getPointerElementType(); @@ -42,7 +43,7 @@ inline FunctionGroup::UniquePtr axtrncdecode() { in = ir_load(B, in); const bool intconversion = type->isIntegerTy(); - assert(intconversion || type->isHalfTy()); + OPENVDB_ASSERT(intconversion || type->isHalfTy()); llvm::Value* result = intconversion ? arithmeticConversion(in, B.getInt32Ty(), B) : arithmeticConversion(in, B.getFloatTy(), B); @@ -52,9 +53,9 @@ inline FunctionGroup::UniquePtr axtrncdecode() std::vector outelem, inelem; arrayUnpack(out, outelem, B, /*load*/false); arrayUnpack(in, inelem, B, /*load*/true); - assert(outelem.size() == inelem.size()); + OPENVDB_ASSERT(outelem.size() == inelem.size()); const bool intconversion = inelem.front()->getType()->isIntegerTy(); - assert(intconversion || inelem.front()->getType()->isHalfTy()); + OPENVDB_ASSERT(intconversion || inelem.front()->getType()->isHalfTy()); if (intconversion) arithmeticConversion(inelem, B.getInt32Ty(), B); else arithmeticConversion(inelem, B.getFloatTy(), B); @@ -85,7 +86,7 @@ inline FunctionGroup::UniquePtr axtrncencode() [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::Value* out = args[0]; llvm::Value* in = args[1]; llvm::Type* type = in->getType()->getPointerElementType(); @@ -94,7 +95,7 @@ inline FunctionGroup::UniquePtr axtrncencode() { in = ir_load(B, in); const bool intconversion = in->getType()->isIntegerTy(); - assert(intconversion || in->getType()->isFloatTy()); + OPENVDB_ASSERT(intconversion || in->getType()->isFloatTy()); llvm::Value* result = intconversion ? 
arithmeticConversion(in, B.getInt16Ty(), B) : arithmeticConversion(in, B.getHalfTy(), B); @@ -104,9 +105,9 @@ inline FunctionGroup::UniquePtr axtrncencode() std::vector outelem, inelem; arrayUnpack(out, outelem, B, /*load*/false); arrayUnpack(in, inelem, B, /*load*/true); - assert(outelem.size() == inelem.size()); + OPENVDB_ASSERT(outelem.size() == inelem.size()); const bool intconversion = inelem.front()->getType()->isIntegerTy(); - assert(intconversion || inelem.front()->getType()->isFloatTy()); + OPENVDB_ASSERT(intconversion || inelem.front()->getType()->isFloatTy()); if (intconversion) arithmeticConversion(inelem, B.getInt16Ty(), B); else arithmeticConversion(inelem, B.getHalfTy(), B); @@ -137,7 +138,7 @@ inline FunctionGroup::UniquePtr axfxptdecode(const bool OneByte, const bool IsPo [IsPositionRange](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::Value* out = args[0]; // out llvm::Value* in = args[1]; // in llvm::Type* type = in->getType()->getPointerElementType(); @@ -147,7 +148,7 @@ inline FunctionGroup::UniquePtr axfxptdecode(const bool OneByte, const bool IsPo if (type->isIntegerTy()) { in = ir_load(B, in); - assert(type->isIntegerTy(8) || type->isIntegerTy(16)); + OPENVDB_ASSERT(type->isIntegerTy(8) || type->isIntegerTy(16)); llvm::Value* s = B.CreateUIToFP(in, B.getFloatTy()); llvm::Value* d = type->isIntegerTy(8) ? LLVMType::get(B.getContext(), float(std::numeric_limits::max())) : @@ -160,9 +161,9 @@ inline FunctionGroup::UniquePtr axfxptdecode(const bool OneByte, const bool IsPo std::vector outelem, inelem; arrayUnpack(out, outelem, B, /*load*/false); arrayUnpack(in, inelem, B, /*load*/true); - assert(inelem.size() >= 3); - assert(outelem.size() == inelem.size()); - assert(inelem.front()->getType()->isIntegerTy(8) || inelem.front()->getType()->isIntegerTy(16)); + OPENVDB_ASSERT(inelem.size() >= 3); + OPENVDB_ASSERT(outelem.size() == inelem.size()); + OPENVDB_ASSERT(inelem.front()->getType()->isIntegerTy(8) || inelem.front()->getType()->isIntegerTy(16)); llvm::Value* d = inelem.front()->getType()->isIntegerTy(8) ? 
LLVMType::get(B.getContext(), float(std::numeric_limits::max())) : @@ -198,7 +199,7 @@ inline FunctionGroup::UniquePtr axfxptencode(const bool OneByte, const bool IsPo [IsPositionRange](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::LLVMContext& C = B.getContext(); llvm::Function* base = B.GetInsertBlock()->getParent(); llvm::Value* u = args[0]; // out @@ -260,12 +261,12 @@ inline FunctionGroup::UniquePtr axfxptencode(const bool OneByte, const bool IsPo [OneByte, IsPositionRange](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); std::vector out, in; arrayUnpack(args[0], out, B, /*load*/false); arrayUnpack(args[1], in, B, /*load*/false); - assert(in.size() >= 3); - assert(out.size() == in.size()); + OPENVDB_ASSERT(in.size() >= 3); + OPENVDB_ASSERT(out.size() == in.size()); auto F = axfxptencode(OneByte, IsPositionRange); for (size_t i = 0; i < in.size(); ++i) { @@ -387,7 +388,7 @@ llvm::Type* Codec::findReturnTypeFromArg(const codegen::FunctionGroup* const gro for (const auto& F : functions) { types.clear(); F->types(types, arg->getContext()); - assert(types.size() == 2); + OPENVDB_ASSERT(types.size() == 2); if (types[1] != arg) continue; return types[0]; } diff --git a/openvdb_ax/openvdb_ax/codegen/Codecs.h b/openvdb_ax/openvdb_ax/codegen/Codecs.h index 275f1a9342..e7ed6b5fc7 100644 --- a/openvdb_ax/openvdb_ax/codegen/Codecs.h +++ b/openvdb_ax/openvdb_ax/codegen/Codecs.h @@ -6,6 +6,7 @@ #include #include +#include #include "openvdb_ax/ast/Tokens.h" #include "openvdb_ax/codegen/FunctionTypes.h" @@ -47,11 +48,11 @@ class OPENVDB_AX_API Codec , mDecoder(std::move(decoder)) , mFlag(flag) { #ifndef NDEBUG - assert(!mEncoder->list().empty()); - assert(!mDecoder->list().empty()); - assert(mEncoder->list().size() == mDecoder->list().size()); + OPENVDB_ASSERT(!mEncoder->list().empty()); + OPENVDB_ASSERT(!mDecoder->list().empty()); + OPENVDB_ASSERT(mEncoder->list().size() == mDecoder->list().size()); for (const auto& F : mEncoder->list()) { - assert(F->size() == 1 || F->size() == 2); + OPENVDB_ASSERT(F->size() == 1 || F->size() == 2); } #endif } diff --git a/openvdb_ax/openvdb_ax/codegen/ComputeGenerator.cc b/openvdb_ax/openvdb_ax/codegen/ComputeGenerator.cc index 07eda3dc07..8088767586 100644 --- a/openvdb_ax/openvdb_ax/codegen/ComputeGenerator.cc +++ b/openvdb_ax/openvdb_ax/codegen/ComputeGenerator.cc @@ -14,6 +14,8 @@ #include "../compiler/CustomData.h" #include "../Exceptions.h" +#include + #include #include #include @@ -248,7 +250,7 @@ bool ComputeGenerator::visit(const ast::TernaryOperator* tern) if (conditionSuccess) { // get the condition trueValue = mValues.top(); mValues.pop(); - assert(trueValue); + OPENVDB_ASSERT(trueValue); trueType = trueValue->getType(); truePtr = trueType->isPointerTy(); @@ -295,7 +297,7 @@ bool ComputeGenerator::visit(const ast::TernaryOperator* tern) llvm::Value* falseValue = mValues.top(); mValues.pop(); llvm::Type* falseType = falseValue->getType(); - assert(trueType); + OPENVDB_ASSERT(trueType); // if both variables of same type do no casting or loading if (trueType != falseType) { // get the (contained) types of the expressions @@ -308,7 +310,7 @@ bool ComputeGenerator::visit(const ast::TernaryOperator* tern) // if same contained type but one needs loading // can only have one pointer, one not, for scalars right now, i.e. 
no loaded arrays or strings if (trueType == falseType) { - assert(!(truePtr && falsePtr)); + OPENVDB_ASSERT(!(truePtr && falsePtr)); if (truePtr) { mBuilder.SetInsertPoint(trueBranch); trueValue = ir_load(mBuilder, trueValue); @@ -326,7 +328,7 @@ bool ComputeGenerator::visit(const ast::TernaryOperator* tern) const bool trueScalar = (trueType->isIntegerTy() || trueType->isFloatingPointTy()); if (trueScalar && (falseType->isIntegerTy() || falseType->isFloatingPointTy())) { - assert(trueType != falseType); + OPENVDB_ASSERT(trueType != falseType); // SCALAR_SCALAR returnType = typePrecedence(trueType, falseType); // always load scalars here, even if they are the correct type @@ -429,7 +431,7 @@ bool ComputeGenerator::visit(const ast::Loop* loop) llvm::BasicBlock* postBodyBlock = conditionBlock; const ast::tokens::LoopToken loopType = loop->loopType(); - assert((loopType == ast::tokens::LoopToken::FOR || + OPENVDB_ASSERT((loopType == ast::tokens::LoopToken::FOR || loopType == ast::tokens::LoopToken::WHILE || loopType == ast::tokens::LoopToken::DO) && "Unsupported loop type"); @@ -512,7 +514,7 @@ bool ComputeGenerator::visit(const ast::Loop* loop) bool ComputeGenerator::visit(const ast::Keyword* node) { const ast::tokens::KeywordToken keyw = node->keyword(); - assert((keyw == ast::tokens::KeywordToken::RETURN || + OPENVDB_ASSERT((keyw == ast::tokens::KeywordToken::RETURN || keyw == ast::tokens::KeywordToken::BREAK || keyw == ast::tokens::KeywordToken::CONTINUE) && "Unsupported keyword"); @@ -541,11 +543,11 @@ bool ComputeGenerator::visit(const ast::Keyword* node) breakContinue = mBreakContinueStack.top(); if (keyw == ast::tokens::KeywordToken::BREAK) { - assert(breakContinue.first); + OPENVDB_ASSERT(breakContinue.first); mBuilder.CreateBr(breakContinue.first); } else if (keyw == ast::tokens::KeywordToken::CONTINUE) { - assert(breakContinue.second); + OPENVDB_ASSERT(breakContinue.second); mBuilder.CreateBr(breakContinue.second); } } @@ -611,7 +613,7 @@ bool ComputeGenerator::visit(const ast::BinaryOperator* node) mBuilder.SetInsertPoint(returnBlock); if (lhsBranch) {// i.e. 
lhs was successful - assert(rhs && lhs); + OPENVDB_ASSERT(rhs && lhs); llvm::PHINode* result = mBuilder.CreatePHI(LLVMType::get(mContext), 2, "binary_op"); result->addIncoming(lhs, lhsBranch->getParent()); result->addIncoming(rhs, rhsBranch->getParent()); @@ -700,7 +702,7 @@ bool ComputeGenerator::visit(const ast::UnaryOperator* node) type = type->getArrayElementType(); std::vector elements; arrayUnpack(value, elements, mBuilder, /*load*/true); - assert(elements.size() > 0); + OPENVDB_ASSERT(elements.size() > 0); if (type->isIntegerTy()) { if (token == ast::tokens::MINUS) { @@ -744,7 +746,7 @@ bool ComputeGenerator::visit(const ast::UnaryOperator* node) mLog.error("value is not a scalar or vector", node); return false; } - assert(result); + OPENVDB_ASSERT(result); mValues.pop(); mValues.push(result); return true; @@ -761,7 +763,7 @@ bool ComputeGenerator::visit(const ast::AssignExpression* assign) if (assign->isCompound()) { llvm::Value* rhsValue = nullptr; if (!this->binaryExpression(rhsValue, lhs, rhs, assign->operation(), assign)) return false; - assert(rhsValue); + OPENVDB_ASSERT(rhsValue); rhs = rhsValue; rhsType = rhs->getType(); } @@ -794,7 +796,7 @@ bool ComputeGenerator::visit(const ast::Crement* node) } else { llvm::Value* crement = nullptr; - assert((node->increment() || node->decrement()) && "unrecognised crement operation"); + OPENVDB_ASSERT((node->increment() || node->decrement()) && "unrecognised crement operation"); if (node->increment()) crement = LLVMType::get(mContext, 1); else if (node->decrement()) crement = LLVMType::get(mContext, -1); @@ -823,7 +825,7 @@ bool ComputeGenerator::visit(const ast::FunctionCall* node) } else { const size_t args = node->children(); - assert(mValues.size() >= args); + OPENVDB_ASSERT(mValues.size() >= args); // initialize arguments. scalars are always passed by value, arrays // and strings always by pointer @@ -843,7 +845,7 @@ bool ComputeGenerator::visit(const ast::FunctionCall* node) } else { // arrays should never be loaded - assert(!type->isArrayTy() && type != LLVMType::get(mContext)); + OPENVDB_ASSERT(!type->isArrayTy() && type != LLVMType::get(mContext)); if (type->isIntegerTy() || type->isFloatingPointTy()) { /*pass by value*/ } @@ -858,7 +860,7 @@ bool ComputeGenerator::visit(const ast::FunctionCall* node) const Function* target = function->match(inputTypes, mContext, &match); if (!target) { - assert(!function->list().empty() + OPENVDB_ASSERT(!function->list().empty() && "FunctionGroup has no function declarations"); std::ostringstream os; @@ -896,7 +898,7 @@ bool ComputeGenerator::visit(const ast::FunctionCall* node) result = target->call(arguments, mBuilder, /*cast=*/false); } - assert(result && "Function has been invoked with no valid llvm Value return"); + OPENVDB_ASSERT(result && "Function has been invoked with no valid llvm Value return"); mValues.push(result); } } @@ -929,7 +931,7 @@ bool ComputeGenerator::visit(const ast::Cast* node) if (targetType->isIntegerTy(1)) { // if target is bool, perform standard boolean conversion (*not* truncation). 
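The parenthetical in that comment is worth a concrete example: a bool cast in AX compares the value against zero, whereas an LLVM trunc to i1 would keep only the low bit. A minimal standalone illustration (plain C++, function names mine, mirroring what boolComparison emits below):

    #include <cstdio>

    // What boolComparison generates, in scalar terms:
    bool boolCompare(double v) { return v != 0.0; }  // 0.5 -> true
    // What a naive truncation to one bit would do instead:
    bool truncToBit(int v) { return (v & 1) != 0; }  // 2 -> false (wrong)

    int main() {
        std::printf("%d %d\n", boolCompare(0.5), truncToBit(2)); // prints: 1 0
        return 0;
    }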
value = boolComparison(value, mBuilder); - assert(value->getType()->isIntegerTy(1)); + OPENVDB_ASSERT(value->getType()->isIntegerTy(1)); } else { value = arithmeticConversion(value, targetType, mBuilder); @@ -965,7 +967,7 @@ bool ComputeGenerator::visit(const ast::DeclareLocal* node) value = insertStaticAlloca(mBuilder, type); } - assert(value); + OPENVDB_ASSERT(value); SymbolTable* current = mSymbolTables.getOrInsert(mScopeIndex); const std::string& name = node->local()->name(); @@ -1105,7 +1107,7 @@ bool ComputeGenerator::visit(const ast::ArrayUnpack* node) else { // component0 = row, component1 = column. Index into the matrix array // which is layed out in row major = (component0*dim + component1) - assert(size == 9 || size == 16); + OPENVDB_ASSERT(size == 9 || size == 16); const int32_t dim = size == 9 ? 3 : 4; llvm::Value* offset = LLVMType::get(mContext, static_cast(dim)); @@ -1190,7 +1192,7 @@ bool ComputeGenerator::visit(const ast::Value* node) bool ComputeGenerator::visit(const ast::Value* node) { - assert(node->value().size() < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(node->value().size() < static_cast(std::numeric_limits::max())); const FunctionGroup* axstring = this->getFunction("string::string", /*internal*/true); llvm::Value* loc = mBuilder.CreateGlobalStringPtr(node->value()); // char* llvm::Value* result = axstring->execute({loc}, mBuilder); @@ -1203,7 +1205,7 @@ const FunctionGroup* ComputeGenerator::getFunction(const std::string &identifier { const FunctionGroup* F = mFunctionRegistry.getOrInsert(identifier, mOptions, allowInternal); - assert(F); + OPENVDB_ASSERT(F); return F; } @@ -1229,7 +1231,7 @@ template typename std::enable_if::value, bool>::type ComputeGenerator::visit(const ast::Value* node) { - assert(std::isinf(node->value()) || node->value() >= 0.0); + OPENVDB_ASSERT(std::isinf(node->value()) || node->value() >= 0.0); llvm::Constant* value = LLVMType::get(mContext, node->value()); mValues.push(value); return true; @@ -1267,7 +1269,7 @@ bool ComputeGenerator::visit(const ast::Tree*) bool ComputeGenerator::visit(const ast::Attribute*) { - assert(false && "Base ComputeGenerator attempted to generate code for an Attribute. " + OPENVDB_ASSERT(false && "Base ComputeGenerator attempted to generate code for an Attribute. " "PointComputeGenerator or VolumeComputeGenerator should be used for " "attribute accesses."); return false; @@ -1314,7 +1316,7 @@ bool ComputeGenerator::assignExpression(llvm::Value* lhs, llvm::Value*& rhs, con return false; } else if (lsize == 1) { - assert(rsize > 1); + OPENVDB_ASSERT(rsize > 1); mLog.error("cannot assign a scalar value " "from a vector or matrix. 
Consider using the [] operator to " "get a particular element", node); @@ -1332,7 +1334,7 @@ bool ComputeGenerator::assignExpression(llvm::Value* lhs, llvm::Value*& rhs, con (ltype->isFloatingPointTy() || ltype->isIntegerTy() || ltype->isArrayTy()); if (componentwise) { - assert(rsize == lsize || (rsize == 1 || lsize == 1)); + OPENVDB_ASSERT(rsize == lsize || (rsize == 1 || lsize == 1)); const size_t resultsize = std::max(lsize, rsize); if (ltype != rtype) { @@ -1357,7 +1359,7 @@ bool ComputeGenerator::assignExpression(llvm::Value* lhs, llvm::Value*& rhs, con if (!this->binaryExpression(newRhs, LLVMType::get(mContext, 0), rhs, ast::tokens::NOTEQUALS, node)) return false; if (!newRhs) return true; rhs = newRhs; - assert(newRhs->getType()->isIntegerTy(1)); + OPENVDB_ASSERT(newRhs->getType()->isIntegerTy(1)); } for (size_t i = 0; i < resultsize; ++i) { @@ -1417,7 +1419,7 @@ void ComputeGenerator::createFreeSymbolStrings(llvm::IRBuilder<>& B) for (llvm::BasicBlock& BB : *F) { llvm::Instruction* TI = BB.getTerminator(); - assert(TI); + OPENVDB_ASSERT(TI); if (llvm::isa(TI)) { B.SetInsertPoint(TI); for (auto ptr : ptrs) { @@ -1576,9 +1578,9 @@ bool ComputeGenerator::binaryExpression(llvm::Value*& result, llvm::Value* lhs, if (componentwise) { - assert(ltype->isArrayTy() || ltype->isFloatingPointTy() || ltype->isIntegerTy()); - assert(rtype->isArrayTy() || rtype->isFloatingPointTy() || rtype->isIntegerTy()); - assert(rsize == lsize || (rsize == 1 || lsize == 1)); + OPENVDB_ASSERT(ltype->isArrayTy() || ltype->isFloatingPointTy() || ltype->isIntegerTy()); + OPENVDB_ASSERT(rtype->isArrayTy() || rtype->isFloatingPointTy() || rtype->isIntegerTy()); + OPENVDB_ASSERT(rsize == lsize || (rsize == 1 || lsize == 1)); if (op == ast::tokens::DIVIDE || op == ast::tokens::MODULO) { if (llvm::Constant* c = llvm::dyn_cast(rhs)) { @@ -1641,9 +1643,9 @@ bool ComputeGenerator::binaryExpression(llvm::Value*& result, llvm::Value* lhs, if (op == ast::tokens::MODULO) { const FunctionGroup* mod = this->getFunction("floormod"); - assert(mod); + OPENVDB_ASSERT(mod); target = mod->match({opprec,opprec}, mContext); - assert(target); + OPENVDB_ASSERT(target); } // perform op @@ -1661,7 +1663,7 @@ bool ComputeGenerator::binaryExpression(llvm::Value*& result, llvm::Value* lhs, const ast::tokens::OperatorToken reductionOp = op == ast::tokens::EQUALSEQUALS ? ast::tokens::AND : ast::tokens::OR; result = elements.front(); - assert(result->getType() == LLVMType::get(mContext)); + OPENVDB_ASSERT(result->getType() == LLVMType::get(mContext)); for (size_t i = 1; i < resultsize; ++i) { result = binaryOperator(result, elements[i], reductionOp, mBuilder); } diff --git a/openvdb_ax/openvdb_ax/codegen/ConstantFolding.h b/openvdb_ax/openvdb_ax/codegen/ConstantFolding.h index a0242ba06d..f5af7560da 100644 --- a/openvdb_ax/openvdb_ax/codegen/ConstantFolding.h +++ b/openvdb_ax/openvdb_ax/codegen/ConstantFolding.h @@ -14,6 +14,7 @@ #include "Types.h" #include +#include #include @@ -57,18 +58,18 @@ struct ConstantFolder llvm::LLVMContext& C, Tys&&... 
ts) { - assert(I-1 < args.size()); + OPENVDB_ASSERT(I-1 < args.size()); llvm::Constant* constant = args[I-1]; const llvm::Type* type = constant->getType(); if (type->isIntegerTy()) { - assert(llvm::isa(constant)); + OPENVDB_ASSERT(llvm::isa(constant)); llvm::ConstantInt* cint = llvm::cast(constant); const uint64_t val = cint->getLimitedValue(); return call(args, function, C, val, ts...); } else if (type->isFloatTy() || type->isDoubleTy()) { - assert(llvm::isa(constant)); + OPENVDB_ASSERT(llvm::isa(constant)); llvm::ConstantFP* cfp = llvm::cast(constant); const llvm::APFloat& apf = cfp->getValueAPF(); diff --git a/openvdb_ax/openvdb_ax/codegen/FunctionTypes.cc b/openvdb_ax/openvdb_ax/codegen/FunctionTypes.cc index 751dc13000..4ce9f27e4d 100644 --- a/openvdb_ax/openvdb_ax/codegen/FunctionTypes.cc +++ b/openvdb_ax/openvdb_ax/codegen/FunctionTypes.cc @@ -10,6 +10,7 @@ #include "../Exceptions.h" #include +#include #include #include @@ -139,11 +140,11 @@ Function::call(const std::vector& args, const bool cast) const { llvm::BasicBlock* block = B.GetInsertBlock(); - assert(block); + OPENVDB_ASSERT(block); llvm::Function* currentFunction = block->getParent(); - assert(currentFunction); + OPENVDB_ASSERT(currentFunction); llvm::Module* M = currentFunction->getParent(); - assert(M); + OPENVDB_ASSERT(M); llvm::Function* function = this->create(B.getContext(), M); std::vector inputs(args); if (cast) { @@ -163,7 +164,7 @@ Function::match(const std::vector& inputs, llvm::LLVMContext& C) co if (inputs.size() != this->size()) return None; if (inputs.empty() && this->size() == 0) return Explicit; - assert(!inputs.empty()); + OPENVDB_ASSERT(!inputs.empty()); //llvm::LLVMContext& C = inputs.front()->getContext(); std::vector signature; @@ -296,7 +297,7 @@ IRFunctionBase::create(llvm::LLVMContext& C, llvm::Module* M) const if (this->hasEmbedIR()) return nullptr; llvm::Function* F = this->Function::create(C, M); - assert(F); + OPENVDB_ASSERT(F); // return if the function has already been generated or if no // module has been provided (just the function prototype requested) if (!F->empty() || !M) return F; @@ -331,7 +332,7 @@ IRFunctionBase::create(llvm::LLVMContext& C, llvm::Module* M) const lastInstruction = B.CreateRetVoid(); } else if (!llvm::isa(lastInstruction)) { - assert(lastInstruction); + OPENVDB_ASSERT(lastInstruction); if (lastInstruction->getType()->isVoidTy()) { lastInstruction = B.CreateRetVoid(); } @@ -339,8 +340,8 @@ IRFunctionBase::create(llvm::LLVMContext& C, llvm::Module* M) const lastInstruction = B.CreateRet(lastInstruction); } } - assert(lastInstruction); - assert(llvm::isa(lastInstruction)); + OPENVDB_ASSERT(lastInstruction); + OPENVDB_ASSERT(llvm::isa(lastInstruction)); // pull out the ret type - is null if void llvm::Value* rvalue = @@ -419,14 +420,14 @@ FunctionGroup::execute(const std::vector& args, Function::SignatureMatch match; const Function* target = this->match(inputTypes, B.getContext(), &match); - assert(target); + OPENVDB_ASSERT(target); llvm::Value* result = target->call(args, B, /*cast=*/match == Function::SignatureMatch::Implicit); #ifndef NDEBUG std::vector unused; llvm::Type* ret = target->types(unused, B.getContext()); - assert(result || ret->isVoidTy()); + OPENVDB_ASSERT(result || ret->isVoidTy()); #endif return result; } @@ -448,7 +449,7 @@ FunctionGroup::execute(const std::vector& args, #ifndef NDEBUG std::vector unused; llvm::Type* ret = target->types(unused, B.getContext()); - assert(result || ret->isVoidTy()); + OPENVDB_ASSERT(result || ret->isVoidTy()); #endif 
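The match/execute logic above hinges on Function::SignatureMatch, which grades how well the caller's argument types fit a candidate signature. A sketch reconstructed from the usage visible in this file (the real enum lives in FunctionTypes.h and may differ in detail):

    // Reconstructed from usage here, not copied from the header.
    enum class SignatureMatch {
        None,     // wrong arity or inconvertible types: no call possible
        Size,     // arity matches but the types do not
        Implicit, // callable after implicit casts: call(..., /*cast=*/true)
        Explicit  // exact type match: call(..., /*cast=*/false)
    };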
return target; diff --git a/openvdb_ax/openvdb_ax/codegen/FunctionTypes.h b/openvdb_ax/openvdb_ax/codegen/FunctionTypes.h index e4ce68e4ba..aa8a5a7f61 100644 --- a/openvdb_ax/openvdb_ax/codegen/FunctionTypes.h +++ b/openvdb_ax/openvdb_ax/codegen/FunctionTypes.h @@ -71,6 +71,7 @@ #include "ConstantFolding.h" #include +#include #include #include @@ -274,7 +275,7 @@ struct OPENVDB_AX_API Function , mNames() , mDeps() { // symbol must be a valid string - assert(!symbol.empty()); + OPENVDB_ASSERT(!symbol.empty()); } virtual ~Function() = default; diff --git a/openvdb_ax/openvdb_ax/codegen/PointComputeGenerator.cc b/openvdb_ax/openvdb_ax/codegen/PointComputeGenerator.cc index e38f426bfc..87e07f983b 100644 --- a/openvdb_ax/openvdb_ax/codegen/PointComputeGenerator.cc +++ b/openvdb_ax/openvdb_ax/codegen/PointComputeGenerator.cc @@ -14,6 +14,8 @@ #include "openvdb_ax/Exceptions.h" #include "openvdb_ax/ast/Scanners.h" +#include + #include #include #include @@ -144,15 +146,15 @@ inline void PointComputeGenerator::computePKBR(const AttributeRegistry&) [&](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 12); + OPENVDB_ASSERT(args.size() == 12); llvm::Value* vbuff = args[2]; //extractArgument(rangeFunction, "value_buffer"); llvm::Value* abuff = args[3]; //extractArgument(rangeFunction, "active_buffer"); llvm::Value* buffSize = args[4]; //extractArgument(rangeFunction, "buffer_size"); llvm::Value* mode = args[5]; //extractArgument(rangeFunction, "mode"); - assert(buffSize); - assert(vbuff); - assert(abuff); - assert(mode); + OPENVDB_ASSERT(buffSize); + OPENVDB_ASSERT(vbuff); + OPENVDB_ASSERT(abuff); + OPENVDB_ASSERT(mode); llvm::Function* base = B.GetInsertBlock()->getParent(); llvm::LLVMContext& C = B.getContext(); @@ -327,8 +329,8 @@ decode(llvm::Value* buffer, buffer = B.CreatePointerCast(buffer, type->getPointerTo()); return ir_gep(B, buffer, pid); } - assert(!codecs->empty()); - assert(store); + OPENVDB_ASSERT(!codecs->empty()); + OPENVDB_ASSERT(store); llvm::Function* self = B.GetInsertBlock()->getParent(); llvm::BasicBlock* post = llvm::BasicBlock::Create(C, "k1.get_buffer.decode", self); @@ -351,13 +353,13 @@ decode(llvm::Value* buffer, // the input value and decode the value. 
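For reference, the scalar math behind the fixed-point decode this dispatch eventually reaches (compare axfxptdecode earlier in the diff) is small; the sketch below assumes the codecs mirror openvdb::points fixed-point semantics, and the function name is mine:

    #include <cstdint>
    #include <limits>

    // Decode an 8- or 16-bit fixed-point value to float. Position-range
    // codecs re-centre the unit interval on zero, matching the extra
    // offset the IsPositionRange branch emits in the generated IR.
    template <typename IntT> // uint8_t or uint16_t
    float fxptDecode(IntT in, bool isPositionRange)
    {
        const float s = float(in) / float(std::numeric_limits<IntT>::max());
        return isPositionRange ? s - 0.5f : s;
    }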
const FunctionGroup* const F = codec->decoder(); llvm::Type* encodedType = codec->decodedToEncoded(decodedType, C); - assert(encodedType); + OPENVDB_ASSERT(encodedType); encodedType = encodedType->getPointerTo(); // guranteed to be castable llvm::Value* typedBuffer = B.CreatePointerCast(buffer, encodedType); llvm::Value* encoded = ir_gep(B, typedBuffer, pid); - assert(F->match({store->getType(), encoded->getType()}, C)); + OPENVDB_ASSERT(F->match({store->getType(), encoded->getType()}, C)); F->execute({store, encoded}, B); B.CreateBr(post); } @@ -403,7 +405,7 @@ encode(llvm::Value* in, B.CreateStore(ir_load(B, in), ir_gep(B, buffer, pid)); return; } - assert(!codecs->empty()); + OPENVDB_ASSERT(!codecs->empty()); llvm::Function* self = B.GetInsertBlock()->getParent(); llvm::BasicBlock* post = llvm::BasicBlock::Create(C, "k1.set_buffer.encode", self); @@ -424,11 +426,11 @@ encode(llvm::Value* in, { const FunctionGroup* const F = codec->encoder(); llvm::Type* encodedType = codec->decodedToEncoded(decodedType, C); - assert(encodedType); + OPENVDB_ASSERT(encodedType); encodedType = encodedType->getPointerTo(); llvm::Value* typedBuffer = B.CreatePointerCast(buffer, encodedType); llvm::Value* loc = ir_gep(B, typedBuffer, pid); - assert(F->match({loc->getType(),in->getType()}, C)); + OPENVDB_ASSERT(F->match({loc->getType(),in->getType()}, C)); F->execute({loc, in}, B); B.CreateBr(post); } @@ -455,15 +457,15 @@ inline void PointComputeGenerator::computePKB(const AttributeRegistry& registry) [&](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 11); + OPENVDB_ASSERT(args.size() == 11); auto& C = B.getContext(); llvm::Function* self = B.GetInsertBlock()->getParent(); llvm::Value* pindex = extractArgument(self, "point_index"); llvm::Value* flags = extractArgument(self, "flags"); llvm::Value* buffers = extractArgument(self, "buffers"); - assert(buffers); - assert(pindex); - assert(flags); + OPENVDB_ASSERT(buffers); + OPENVDB_ASSERT(pindex); + OPENVDB_ASSERT(flags); // create array of void*. each pointer will encode an address to a stored typed value llvm::Type* locType = llvm::ArrayType::get(LLVMType::get(C), registry.data().size()); // [SIZE x i8*] @@ -479,7 +481,7 @@ inline void PointComputeGenerator::computePKB(const AttributeRegistry& registry) decodedPtrs = B.CreatePointerCast(decodedPtrs, type->getPointerTo()->getPointerTo()); // ValueType** llvm::Value* index = mModule.getGlobalVariable(token); - assert(index); + OPENVDB_ASSERT(index); index = ir_load(B, index); llvm::Value* buffer = ir_gep(B, buffers, index); buffer = ir_load(B, buffer); // void** = void* @@ -548,7 +550,7 @@ inline void PointComputeGenerator::computePKB(const AttributeRegistry& registry) store = B.CreatePointerCast(store, type->getPointerTo()); // ValueType* llvm::Value* index = mModule.getGlobalVariable(token); - assert(index); + OPENVDB_ASSERT(index); index = ir_load(B, index); llvm::Value* flag = ir_load(B, ir_gep(B, flags, index)); @@ -613,11 +615,11 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry // insert the attribute into the map of global variables and get a unique global representing // the location which will hold the attribute handle offset. 
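The binding scheme these kernels rely on can be pictured from the host side: the compiled function receives a void** of type-erased attribute arrays, and each attribute token resolves, through a per-token module-level global, to a fixed slot in that array. A deliberately simplified sketch, not AX's actual runtime (names are mine, and the real code goes through AttributeHandles rather than raw buffers):

    #include <cstdint>

    // One type-erased array per bound attribute; the slot index for a
    // token such as "float@density" is burned into the module as a global
    // at compile time.
    float readFloatAttribute(void* const* attributeArrays,
                             uint64_t slot, uint64_t pointIndex)
    {
        const float* buffer = static_cast<const float*>(attributeArrays[slot]);
        return buffer[pointIndex];
    }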
llvm::Value* index = M->getGlobalVariable(token); - assert(index); + OPENVDB_ASSERT(index); index = ir_load(B, index); llvm::Value* arrays = extractArgument(self, "attribute_arrays"); - assert(arrays); + OPENVDB_ASSERT(arrays); llvm::Value* array = ir_gep(B, arrays, index); array = ir_load(B, array); // void** = void* @@ -654,11 +656,11 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry // insert the attribute into the map of global variables and get a unique global representing // the location which will hold the attribute handle offset. llvm::Value* index = M->getGlobalVariable(token); - assert(index); + OPENVDB_ASSERT(index); index = ir_load(B, index); llvm::Value* arrays = extractArgument(self, "attribute_arrays"); - assert(arrays); + OPENVDB_ASSERT(arrays); llvm::Value* array = ir_gep(B, arrays, index); array = ir_load(B, array); // void** = void* @@ -680,7 +682,7 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry if (usingString) { llvm::Value* leafdata = extractArgument(self, "leaf_data"); - assert(leafdata); + OPENVDB_ASSERT(leafdata); args.emplace_back(leafdata); } @@ -694,7 +696,7 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry [&](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 11); + OPENVDB_ASSERT(args.size() == 11); auto& C = B.getContext(); llvm::Function* self = B.GetInsertBlock()->getParent(); llvm::Value* pindex = extractArgument(self, "point_index"); @@ -709,7 +711,7 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry size_t i = 0; for (const AttributeRegistry::AccessData& access : registry.data()) { llvm::Value* value = insertStaticAlloca(B, llvmTypeFromToken(access.type(), C)); - assert(llvm::cast(value)->isStaticAlloca()); + OPENVDB_ASSERT(llvm::cast(value)->isStaticAlloca()); table.insert(access.tokenname(), value); // store the allocated ptr in the array of void* @@ -740,7 +742,7 @@ inline void PointComputeGenerator::computePKAA(const AttributeRegistry& registry const std::string token = data.tokenname(); llvm::Value* value = table.get(token); // // Expected to be used more than one (i.e. should never be zero) - // assert(value->hasNUsesOrMore(1)); + // OPENVDB_ASSERT(value->hasNUsesOrMore(1)); // // Check to see if this value is still being used - it may have // // been cleaned up due to returns. If there's only one use, it's // // the original get of this attribute. @@ -862,7 +864,7 @@ bool PointComputeGenerator::visit(const ast::Attribute* node) llvm::Value* index = mModule.getGlobalVariable(node->tokenname()); llvm::Type* type = llvmTypeFromToken(node->type(), mContext); - assert(index); + OPENVDB_ASSERT(index); // index into the void* array of handles and load the value. 
index = ir_load(mBuilder, index); llvm::Value* value = extractArgument(mFunction, "values"); // void** diff --git a/openvdb_ax/openvdb_ax/codegen/PointFunctions.cc b/openvdb_ax/openvdb_ax/codegen/PointFunctions.cc index c6aa6af0e5..a19af9d491 100644 --- a/openvdb_ax/openvdb_ax/codegen/PointFunctions.cc +++ b/openvdb_ax/openvdb_ax/codegen/PointFunctions.cc @@ -24,6 +24,7 @@ #include #include +#include #include @@ -78,8 +79,8 @@ inline FunctionGroup::UniquePtr ax_ingroup(const FunctionOptions& op) const void* const leafDataPtr, const void* const data) -> bool { - assert(name); - assert(index < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(name); + OPENVDB_ASSERT(index < static_cast(std::numeric_limits::max())); if (name->size() == 0) return false; if (!groupHandles) return false; @@ -134,10 +135,10 @@ inline FunctionGroup::UniquePtr axingroup(const FunctionOptions& op) llvm::Value* group_handles = extractArgument(compute, "group_handles"); llvm::Value* leaf_data = extractArgument(compute, "leaf_data"); llvm::Value* attribute_set = extractArgument(compute, "attribute_set"); - assert(point_index); - assert(group_handles); - assert(leaf_data); - assert(attribute_set); + OPENVDB_ASSERT(point_index); + OPENVDB_ASSERT(group_handles); + OPENVDB_ASSERT(leaf_data); + OPENVDB_ASSERT(attribute_set); std::vector input(args); input.emplace_back(point_index); @@ -169,7 +170,7 @@ inline FunctionGroup::UniquePtr axeditgroup(const FunctionOptions& op) const void* const data, const bool flag) { - assert(name); + OPENVDB_ASSERT(name); if (name->size() == 0) return; // Get the group handle out of the pre-existing container of handles if they @@ -189,7 +190,7 @@ inline FunctionGroup::UniquePtr axeditgroup(const FunctionOptions& op) // the set of new data thats being added if (!flag && !leafData->hasGroup(nameStr)) return; handle = leafData->getOrInsert(nameStr); - assert(handle); + OPENVDB_ASSERT(handle); } // set the group membership @@ -248,10 +249,10 @@ inline FunctionGroup::UniquePtr axaddtogroup(const FunctionOptions& op) llvm::Value* group_handles = extractArgument(compute, "group_handles"); llvm::Value* leaf_data = extractArgument(compute, "leaf_data"); llvm::Value* attribute_set = extractArgument(compute, "attribute_set"); - assert(point_index); - assert(group_handles); - assert(leaf_data); - assert(attribute_set); + OPENVDB_ASSERT(point_index); + OPENVDB_ASSERT(group_handles); + OPENVDB_ASSERT(leaf_data); + OPENVDB_ASSERT(attribute_set); std::vector input(args); input.emplace_back(point_index); @@ -289,10 +290,10 @@ inline FunctionGroup::UniquePtr axremovefromgroup(const FunctionOptions& op) llvm::Value* group_handles = extractArgument(compute, "group_handles"); llvm::Value* leaf_data = extractArgument(compute, "leaf_data"); llvm::Value* attribute_set = extractArgument(compute, "attribute_set"); - assert(point_index); - assert(group_handles); - assert(leaf_data); - assert(attribute_set); + OPENVDB_ASSERT(point_index); + OPENVDB_ASSERT(group_handles); + OPENVDB_ASSERT(leaf_data); + OPENVDB_ASSERT(attribute_set); std::vector input(args); input.emplace_back(point_index); @@ -350,9 +351,9 @@ inline FunctionGroup::UniquePtr axsetattribute(const FunctionOptions& op) ::type>::type; using AttributeHandleType = openvdb::points::AttributeWriteHandle; - assert(attributeHandle); - assert(value); - assert(index < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(attributeHandle); + OPENVDB_ASSERT(value); + OPENVDB_ASSERT(index < static_cast(std::numeric_limits::max())); AttributeHandleType* 
handle = static_cast(attributeHandle); handle->set(static_cast(index), *value); @@ -366,10 +367,10 @@ inline FunctionGroup::UniquePtr axsetattribute(const FunctionOptions& op) { using AttributeHandleType = openvdb::points::StringAttributeWriteHandle; - assert(attributeHandle); - assert(value); - assert(leafDataPtr); - assert(index < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(attributeHandle); + OPENVDB_ASSERT(value); + OPENVDB_ASSERT(leafDataPtr); + OPENVDB_ASSERT(index < static_cast(std::numeric_limits::max())); const std::string s = value->str(); AttributeHandleType* const handle = @@ -468,9 +469,9 @@ inline FunctionGroup::UniquePtr axgetattribute(const FunctionOptions& op) // only being read! using AttributeHandleType = openvdb::points::AttributeHandle; - assert(value); - assert(attributeHandle); - assert(index < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(value); + OPENVDB_ASSERT(attributeHandle); + OPENVDB_ASSERT(index < static_cast(std::numeric_limits::max())); AttributeHandleType* handle = static_cast(attributeHandle); (*value) = handle->get(static_cast(index)); @@ -484,10 +485,10 @@ inline FunctionGroup::UniquePtr axgetattribute(const FunctionOptions& op) { using AttributeHandleType = openvdb::points::StringAttributeHandle; - assert(value); - assert(attributeHandle); - assert(leafDataPtr); - assert(index < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(value); + OPENVDB_ASSERT(attributeHandle); + OPENVDB_ASSERT(leafDataPtr); + OPENVDB_ASSERT(index < static_cast(std::numeric_limits::max())); AttributeHandleType* const handle = static_cast(attributeHandle); diff --git a/openvdb_ax/openvdb_ax/codegen/PointLeafLocalData.h b/openvdb_ax/openvdb_ax/codegen/PointLeafLocalData.h index ac1ef37032..573916805c 100644 --- a/openvdb_ax/openvdb_ax/codegen/PointLeafLocalData.h +++ b/openvdb_ax/openvdb_ax/codegen/PointLeafLocalData.h @@ -17,6 +17,7 @@ #include #include #include +#include namespace openvdb { OPENVDB_USE_VERSION_NAMESPACE @@ -91,13 +92,13 @@ struct PointLeafLocalData #endif if (mArrays.empty() || mOffset == maxGroupsInArray) { - assert(mPointCount < static_cast(std::numeric_limits::max())); + OPENVDB_ASSERT(mPointCount < static_cast(std::numeric_limits::max())); mArrays.emplace_back(new GroupArrayT(static_cast(mPointCount))); mOffset = 0; } GroupArrayT* array = mArrays.back().get(); - assert(array); + OPENVDB_ASSERT(array); std::unique_ptr& handle = mHandles[name]; handle.reset(new GroupHandleT(*array, mOffset++)); diff --git a/openvdb_ax/openvdb_ax/codegen/StandardFunctions.cc b/openvdb_ax/openvdb_ax/codegen/StandardFunctions.cc index a9c09fd957..df9504a485 100644 --- a/openvdb_ax/openvdb_ax/codegen/StandardFunctions.cc +++ b/openvdb_ax/openvdb_ax/codegen/StandardFunctions.cc @@ -20,6 +20,8 @@ #include "../compiler/CompilerOptions.h" #include "../compiler/CustomData.h" +#include + #include #include @@ -88,7 +90,7 @@ struct SimplexNoise llvm::Function* function = \ llvm::Intrinsic::getDeclaration(M, \ llvm::Intrinsic::Identifier, args[0]->getType()); \ - assert(function); \ + OPENVDB_ASSERT(function); \ return B.CreateCall(function, args); \ }; \ \ @@ -146,7 +148,7 @@ inline FunctionGroup::UniquePtr axmalloc(const FunctionOptions& op) args[0], // size nullptr, nullptr); - assert(inst); + OPENVDB_ASSERT(inst); B.Insert(inst); return inst; }; @@ -169,7 +171,7 @@ inline FunctionGroup::UniquePtr axfree(const FunctionOptions& op) { llvm::BasicBlock* BB = B.GetInsertBlock(); llvm::Instruction* inst = llvm::CallInst::CreateFree(args[0], BB); - 
assert(inst); + OPENVDB_ASSERT(inst); B.Insert(inst); return nullptr; }; @@ -355,9 +357,9 @@ inline FunctionGroup::UniquePtr axcross(const FunctionOptions& op) arrayUnpack(args[0], ptrs, B, /*load*/false); arrayUnpack(args[1], left, B, /*load*/true); arrayUnpack(args[2], right, B, /*load*/true); - assert(ptrs.size() == 3); - assert(left.size() == 3); - assert(right.size() == 3); + OPENVDB_ASSERT(ptrs.size() == 3); + OPENVDB_ASSERT(left.size() == 3); + OPENVDB_ASSERT(right.size() == 3); std::vector results(3); @@ -413,7 +415,7 @@ inline FunctionGroup::UniquePtr axlengthsq(const FunctionOptions& op) { std::vector elements; arrayUnpack(args[0], elements, B, /*load*/true); - assert(elements.size() >= 2); + OPENVDB_ASSERT(elements.size() >= 2); llvm::Value* v1 = binaryOperator(elements[0], elements[0], ast::tokens::MULTIPLY, B); llvm::Value* v2 = binaryOperator(elements[1], elements[1], ast::tokens::MULTIPLY, B); @@ -512,8 +514,8 @@ inline FunctionGroup::UniquePtr axnormalize(const FunctionOptions& op) std::vector ptrs, elements; arrayUnpack(args[0], ptrs, B, /*load*/false); arrayUnpack(args[1], elements, B, /*load*/true); - assert(ptrs.size() == 3 || ptrs.size() == 4); - assert(elements.size() == 3 || elements.size() == 4); + OPENVDB_ASSERT(ptrs.size() == 3 || ptrs.size() == 4); + OPENVDB_ASSERT(elements.size() == 3 || elements.size() == 4); if (elements[0]->getType()->isIntegerTy()) { arithmeticConversion(elements, LLVMType::get(B.getContext()), B); @@ -575,7 +577,7 @@ inline FunctionGroup::UniquePtr axlerp(const FunctionOptions& op) [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 3); + OPENVDB_ASSERT(args.size() == 3); llvm::Value* a = args[0], *b = args[1], *t = args[2]; llvm::Value* zero = llvm::ConstantFP::get(a->getType(), 0.0); llvm::Value* one = llvm::ConstantFP::get(a->getType(), 1.0); @@ -1022,7 +1024,7 @@ inline FunctionGroup::UniquePtr axsign(const FunctionOptions& op) llvm::IRBuilder<>& B) -> llvm::Value* { // int r = (T(0) < val) - (val < T(0)); - assert(args.size() == 1); + OPENVDB_ASSERT(args.size() == 1); llvm::Value* arg = args.front(); llvm::Type* type = arg->getType(); llvm::Value* zero; @@ -1030,7 +1032,7 @@ inline FunctionGroup::UniquePtr axsign(const FunctionOptions& op) zero = llvm::ConstantInt::get(type, static_cast(0), /*signed*/true); } else { - assert(type->isFloatingPointTy()); + OPENVDB_ASSERT(type->isFloatingPointTy()); zero = llvm::ConstantFP::get(type, static_cast(0.0)); } @@ -1085,7 +1087,7 @@ inline FunctionGroup::UniquePtr axtruncatemod(const FunctionOptions& op) [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); return binaryOperator(args[0], args[1], ast::tokens::MODULO, B); }; @@ -1129,7 +1131,7 @@ inline FunctionGroup::UniquePtr axfloormod(const FunctionOptions& op) [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::Value* D = args[0]; llvm::Value* d = args[1]; // tmod @@ -1194,7 +1196,7 @@ inline FunctionGroup::UniquePtr axeuclideanmod(const FunctionOptions& op) [](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 2); + OPENVDB_ASSERT(args.size() == 2); llvm::Value* D = args[0], *d = args[1]; llvm::Value* r = binaryOperator(D, d, ast::tokens::MODULO, B); // tmod @@ -1234,7 +1236,7 @@ inline FunctionGroup::UniquePtr axisfinite(const FunctionOptions& op) [op](const std::vector& args, llvm::IRBuilder<>& B) -> 
llvm::Value* { - assert(args.size() == 1); + OPENVDB_ASSERT(args.size() == 1); llvm::Value* arg = args[0]; llvm::Type* etype = arg->getType(); if (etype->isPointerTy()) { @@ -1248,7 +1250,7 @@ inline FunctionGroup::UniquePtr axisfinite(const FunctionOptions& op) inf = LLVMType::get(B.getContext(), apinf.convertToFloat()); } else { - assert(etype->isDoubleTy()); + OPENVDB_ASSERT(etype->isDoubleTy()); const llvm::APFloat apinf = llvm::APFloat::getInf(llvm::APFloatBase::IEEEdouble()); inf = LLVMType::get(B.getContext(), apinf.convertToDouble()); @@ -1310,7 +1312,7 @@ inline FunctionGroup::UniquePtr axisinf(const FunctionOptions& op) [op](const std::vector& args, llvm::IRBuilder<>& B) -> llvm::Value* { - assert(args.size() == 1); + OPENVDB_ASSERT(args.size() == 1); llvm::Value* arg = args[0]; llvm::Type* etype = arg->getType(); if (etype->isPointerTy()) { @@ -1324,7 +1326,7 @@ inline FunctionGroup::UniquePtr axisinf(const FunctionOptions& op) inf = LLVMType::get(B.getContext(), apinf.convertToFloat()); } else { - assert(etype->isDoubleTy()); + OPENVDB_ASSERT(etype->isDoubleTy()); const llvm::APFloat apinf = llvm::APFloat::getInf(llvm::APFloatBase::IEEEdouble()); inf = LLVMType::get(B.getContext(), apinf.convertToDouble()); @@ -1388,7 +1390,7 @@ inline FunctionGroup::UniquePtr axisnan(const FunctionOptions& op) { // uno (unordered) comparison with self // https://llvm.org/docs/LangRef.html#fcmp-instruction - assert(args.size() == 1); + OPENVDB_ASSERT(args.size() == 1); llvm::Value* arg = args[0]; if (!arg->getType()->isPointerTy()) { return B.CreateFCmpUNO(arg, arg); @@ -1450,7 +1452,7 @@ inline FunctionGroup::UniquePtr axdeterminant(const FunctionOptions& op) { std::vector m1; arrayUnpack(args[0], m1, B, /*load*/true); - assert(m1.size() == 9); + OPENVDB_ASSERT(m1.size() == 9); llvm::Value* e1 = binaryOperator(m1[4], m1[8], ast::tokens::MULTIPLY, B); llvm::Value* e2 = binaryOperator(m1[5], m1[7], ast::tokens::MULTIPLY, B); @@ -1480,7 +1482,7 @@ inline FunctionGroup::UniquePtr axdeterminant(const FunctionOptions& op) { std::vector m1; arrayUnpack(args[0], m1, B, /*load*/true); - assert(m1.size() == 16); + OPENVDB_ASSERT(m1.size() == 16); // @note Okay to alloca here as long as embed IR is false llvm::Value* subMat = B.CreateAlloca(llvm::ArrayType::get(m1.front()->getType(), 9)); @@ -1547,7 +1549,7 @@ inline FunctionGroup::UniquePtr axdiag(const FunctionOptions& op) if (size == 3 || size == 4) { //vector - convert to diagonal matrix const size_t dim = size*size; - assert(ptrs.size() == dim); + OPENVDB_ASSERT(ptrs.size() == dim); llvm::Type* type = arg1.front()->getType(); llvm::Value* zero = type->isFloatTy() ? LLVMType::get(B.getContext(), 0.0f) : LLVMType::get(B.getContext(), 0.0); @@ -1563,9 +1565,9 @@ inline FunctionGroup::UniquePtr axdiag(const FunctionOptions& op) } else { // matrix - convert to vector - assert(size == 9 || size == 16); + OPENVDB_ASSERT(size == 9 || size == 16); const size_t dim = size == 9 ? 
            3 : 4;
-    assert(ptrs.size() == dim);
+    OPENVDB_ASSERT(ptrs.size() == dim);
     for (size_t i = 0; i < dim; ++i) {
         B.CreateStore(arg1[i+(i*dim)], ptrs[i]);
     }
@@ -1590,7 +1592,7 @@ inline FunctionGroup::UniquePtr axdiag(const FunctionOptions& op)
         int element = 0;
         for (int i = 0; i < size; ++i) {
             for (int j = 0; j < size; ++j) {
-                assert(element < openvdb::ValueTraits::Elements);
+                OPENVDB_ASSERT(element < openvdb::ValueTraits::Elements);
                 if (i == j) result->asPointer()[element] = (input->asPointer())[i];
                 else result->asPointer()[element] = ElementT(0.0);
                 ++element;
@@ -1598,11 +1600,11 @@ inline FunctionGroup::UniquePtr axdiag(const FunctionOptions& op)
             }
         }
     }
     else {
-        assert(openvdb::ValueTraits::IsMat);
+        OPENVDB_ASSERT(openvdb::ValueTraits::IsMat);
         // input is a matrix, result is a vec
         const int size = openvdb::ValueTraits::Size;
         for (int i = 0; i < size; ++i) {
-            assert(i < openvdb::ValueTraits::Size);
+            OPENVDB_ASSERT(i < openvdb::ValueTraits::Size);
             result->asPointer()[i] = input->asPointer()[i+(i*size)];
         }
     }
@@ -1655,7 +1657,7 @@ inline FunctionGroup::UniquePtr axidentity3(const FunctionOptions& op)
     {
         std::vector elements;
         arrayUnpack(args[0], elements, B, /*load elements*/false);
-        assert(elements.size() == 9);
+        OPENVDB_ASSERT(elements.size() == 9);
         llvm::Value* zero = LLVMType::get(B.getContext(), 0.0f);
         llvm::Value* one = LLVMType::get(B.getContext(), 1.0f);
         for (size_t i = 0; i < 9; ++i) {
@@ -1683,7 +1685,7 @@ inline FunctionGroup::UniquePtr axidentity4(const FunctionOptions& op)
     {
         std::vector elements;
         arrayUnpack(args[0], elements, B, /*load elements*/false);
-        assert(elements.size() == 16);
+        OPENVDB_ASSERT(elements.size() == 16);
         llvm::Value* zero = LLVMType::get(B.getContext(), 0.0f);
         llvm::Value* one = LLVMType::get(B.getContext(), 1.0f);
         for (size_t i = 0; i < 16; ++i) {
@@ -1714,9 +1716,9 @@ inline FunctionGroup::UniquePtr axmmmult(const FunctionOptions& op)
         arrayUnpack(args[1], m1, B, /*load*/true);
         arrayUnpack(args[2], m2, B, /*load*/true);
-        assert(m1.size() == 9 || m1.size() == 16);
-        assert(ptrs.size() == m1.size());
-        assert(ptrs.size() == m2.size());
+        OPENVDB_ASSERT(m1.size() == 9 || m1.size() == 16);
+        OPENVDB_ASSERT(ptrs.size() == m1.size());
+        OPENVDB_ASSERT(ptrs.size() == m2.size());
         const size_t dim = m1.size() == 9 ? 3 : 4;
 
         llvm::Value* e3 = nullptr, *e4 = nullptr;
@@ -1809,14 +1811,14 @@ inline FunctionGroup::UniquePtr axpostscale(const FunctionOptions& op)
         std::vector m1, v1;
         arrayUnpack(args[0], m1, B, /*load*/false);
         arrayUnpack(args[1], v1, B, /*load*/true);
-        assert(m1.size() == 16);
-        assert(v1.size() == 3);
+        OPENVDB_ASSERT(m1.size() == 16);
+        OPENVDB_ASSERT(v1.size() == 3);
 
         // modify first 3 elements in all mat rows
         for (size_t row = 0; row < 4; ++row) {
             for (size_t col = 0; col < 3; ++col) {
                 const size_t idx = (row*4) + col;
-                assert(idx <= 14);
+                OPENVDB_ASSERT(idx <= 14);
                 llvm::Value* m1v = ir_load(B, m1[idx]);
                 m1v = binaryOperator(m1v, v1[col], ast::tokens::MULTIPLY, B);
                 B.CreateStore(m1v, m1[idx]);
@@ -1861,9 +1863,9 @@ inline FunctionGroup::UniquePtr axpretransform(const FunctionOptions& op)
         const size_t vec = v1.size();
         const size_t dim = (m1.size() == 9 ? 3 : 4);
-        assert(m1.size() == 9 || m1.size() == 16);
-        assert(vec == 3 || vec == 4);
-        assert(ptrs.size() == vec);
+        OPENVDB_ASSERT(m1.size() == 9 || m1.size() == 16);
+        OPENVDB_ASSERT(vec == 3 || vec == 4);
+        OPENVDB_ASSERT(ptrs.size() == vec);
 
         // mat * vec
         llvm::Value* e3 = nullptr, *e4 = nullptr;
@@ -1925,14 +1927,14 @@ inline FunctionGroup::UniquePtr axprescale(const FunctionOptions& op)
         std::vector m1, v1;
         arrayUnpack(args[0], m1, B, /*load*/false);
         arrayUnpack(args[1], v1, B, /*load*/true);
-        assert(m1.size() == 16);
-        assert(v1.size() == 3);
+        OPENVDB_ASSERT(m1.size() == 16);
+        OPENVDB_ASSERT(v1.size() == 3);
 
         // modify first 3 mat rows, all columns
         for (size_t row = 0; row < 3; ++row) {
             for (size_t col = 0; col < 4; ++col) {
                 const size_t idx = (row*4) + col;
-                assert(idx <= 11);
+                OPENVDB_ASSERT(idx <= 11);
                 llvm::Value* m1v = ir_load(B, m1[idx]);
                 m1v = binaryOperator(m1v, v1[row], ast::tokens::MULTIPLY, B);
                 B.CreateStore(m1v, m1[idx]);
@@ -1971,7 +1973,7 @@ inline FunctionGroup::UniquePtr axtrace(const FunctionOptions& op)
         std::vector m1;
         arrayUnpack(args[0], m1, B, /*load*/true);
         const size_t dim = (m1.size() == 9 ? 3 : 4);
-        assert(m1.size() == 9 || m1.size() == 16);
+        OPENVDB_ASSERT(m1.size() == 9 || m1.size() == 16);
 
         llvm::Value* result = binaryOperator(m1[0], m1[1+dim], ast::tokens::PLUS, B);
         result = binaryOperator(result, m1[2+(2*dim)], ast::tokens::PLUS, B);
@@ -2027,9 +2029,9 @@ inline FunctionGroup::UniquePtr axtransform(const FunctionOptions& op)
         const size_t vec = v1.size();
         const size_t dim = (m1.size() == 9 ? 3 : 4);
-        assert(m1.size() == 9 || m1.size() == 16);
-        assert(vec == 3 || vec == 4);
-        assert(ptrs.size() == vec);
+        OPENVDB_ASSERT(m1.size() == 9 || m1.size() == 16);
+        OPENVDB_ASSERT(vec == 3 || vec == 4);
+        OPENVDB_ASSERT(ptrs.size() == vec);
 
         // vec * mat
         llvm::Value* e3 = nullptr, *e4 = nullptr;
@@ -2091,8 +2093,8 @@ inline FunctionGroup::UniquePtr axtranspose(const FunctionOptions& op)
         std::vector ptrs, m1;
         arrayUnpack(args[0], ptrs, B, /*load*/false);
         arrayUnpack(args[1], m1, B, /*load*/true);
-        assert(m1.size() == 9 || m1.size() == 16);
-        assert(ptrs.size() == m1.size());
+        OPENVDB_ASSERT(m1.size() == 9 || m1.size() == 16);
+        OPENVDB_ASSERT(ptrs.size() == m1.size());
         const size_t dim = m1.size() == 9 ? 3 : 4;
 
         for (size_t i = 0; i < dim; ++i) {
@@ -2138,11 +2140,11 @@ inline FunctionGroup::UniquePtr axadjoint(const FunctionOptions& op)
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         std::vector m1, m2;
         arrayUnpack(args[1], m1, B, /*load*/true);
         arrayUnpack(args[0], m2, B, /*load*/false); // args[0] is return type
-        assert(m1.size() == 9 && m2.size() == 9);
+        OPENVDB_ASSERT(m1.size() == 9 && m2.size() == 9);
 
         auto mul_sub = [&](const size_t a, const size_t b, const size_t c, const size_t d) {
             return binaryOperator(
@@ -2195,11 +2197,11 @@ inline FunctionGroup::UniquePtr axcofactor(const FunctionOptions& op)
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         std::vector m1, m2;
         arrayUnpack(args[1], m1, B, /*load*/true);
         arrayUnpack(args[0], m2, B, /*load*/false); // args[0] is return type
-        assert(m1.size() == 9 && m2.size() == 9);
+        OPENVDB_ASSERT(m1.size() == 9 && m2.size() == 9);
 
         auto mul_sub = [&](const size_t a, const size_t b, const size_t c, const size_t d) {
             return binaryOperator(
@@ -2252,13 +2254,13 @@ inline FunctionGroup::UniquePtr axinverse(const FunctionOptions& op)
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         llvm::Value* adj = axadjoint(op)->execute({args[1]}, B);
 
         std::vector m1, madj;
         arrayUnpack(adj, madj, B, /*load*/true);
         arrayUnpack(args[0], m1, B, /*load*/false); // result
-        assert(madj.size() == 9 && m1.size() == 9);
+        OPENVDB_ASSERT(madj.size() == 9 && m1.size() == 9);
 
         // compute determinant of the input mat by reusing the adjoint's 0, 3 and 6 terms
         llvm::Value* m20 = ir_load(B, ir_constgep2_64(B, args[1], 0, 0));
@@ -2433,7 +2435,7 @@ inline FunctionGroup::UniquePtr axdegrees(const FunctionOptions& op)
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 1);
+        OPENVDB_ASSERT(args.size() == 1);
         llvm::Value* arg = args.front();
         llvm::Value* pi180 = arg->getType()->isFloatTy() ?
             LLVMType::get(B.getContext(), 180.f / openvdb::math::pi()) :
@@ -2460,7 +2462,7 @@ inline FunctionGroup::UniquePtr axradians(const FunctionOptions& op)
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 1);
+        OPENVDB_ASSERT(args.size() == 1);
         llvm::Value* arg = args.front();
         llvm::Value* pi180 = arg->getType()->isFloatTy() ?
             LLVMType::get(B.getContext(), openvdb::math::pi() / 180.f) :
@@ -2735,7 +2737,7 @@ inline FunctionGroup::UniquePtr axhsvtorgb(const FunctionOptions& op)
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         llvm::Function* base = B.GetInsertBlock()->getParent();
 
         std::vector hsv, rgb;
@@ -2899,7 +2901,7 @@ inline FunctionGroup::UniquePtr axrgbtohsv(const FunctionOptions& op)
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         llvm::Function* base = B.GetInsertBlock()->getParent();
         llvm::LLVMContext& C = B.getContext();
@@ -3095,11 +3097,11 @@ inline FunctionGroup::UniquePtr axexternal(const FunctionOptions& op)
     {
         // Pull out the custom data from the parent function
         llvm::Function* compute = B.GetInsertBlock()->getParent();
-        assert(compute);
-        assert(std::string(compute->getName()).rfind("ax.compute", 0) == 0);
+        OPENVDB_ASSERT(compute);
+        OPENVDB_ASSERT(std::string(compute->getName()).rfind("ax.compute", 0) == 0);
         llvm::Value* arg = extractArgument(compute, 0);
-        assert(arg);
-        assert(arg->getName() == "custom_data");
+        OPENVDB_ASSERT(arg);
+        OPENVDB_ASSERT(arg->getName() == "custom_data");
 
         std::vector inputs;
         inputs.reserve(2 + args.size());
@@ -3134,11 +3136,11 @@ inline FunctionGroup::UniquePtr axexternalv(const FunctionOptions& op)
     {
         // Pull out the custom data from the parent function
         llvm::Function* compute = B.GetInsertBlock()->getParent();
-        assert(compute);
-        assert(std::string(compute->getName()).rfind("ax.compute", 0) == 0);
+        OPENVDB_ASSERT(compute);
+        OPENVDB_ASSERT(std::string(compute->getName()).rfind("ax.compute", 0) == 0);
         llvm::Value* arg = extractArgument(compute, 0);
-        assert(arg);
-        assert(arg->getName() == "custom_data");
+        OPENVDB_ASSERT(arg);
+        OPENVDB_ASSERT(arg->getName() == "custom_data");
 
         std::vector inputs;
         inputs.reserve(2 + args.size());
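Reviewer note: every assert swapped above is backed by the new <openvdb/util/Assert.h> include, which supplies the OPENVDB_ASSERT macro. For readers following the patch, a minimal sketch of what such a wrapper typically looks like, assuming an opt-in OPENVDB_ENABLE_ASSERTS define (the shipped header may differ):

    // Sketch only: OPENVDB_ENABLE_ASSERTS is an assumed build toggle.
    #include <cassert>
    #ifdef OPENVDB_ENABLE_ASSERTS
        #define OPENVDB_ASSERT(X) assert(X)
    #else
        #define OPENVDB_ASSERT(X) (void)(0)
    #endif

The practical effect is that the checks can be toggled independently of NDEBUG, and the assert(expr && "message") idiom used throughout keeps working unchanged.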
diff --git a/openvdb_ax/openvdb_ax/codegen/String.h b/openvdb_ax/openvdb_ax/codegen/String.h
index b03b0cef2e..65546fd860 100644
--- a/openvdb_ax/openvdb_ax/codegen/String.h
+++ b/openvdb_ax/openvdb_ax/codegen/String.h
@@ -14,10 +14,10 @@
 #include 
 #include 
+#include <openvdb/util/Assert.h>
 #include 
 #include 
-#include <cassert>
 
 namespace openvdb {
 OPENVDB_USE_VERSION_NAMESPACE
@@ -41,7 +41,7 @@ struct String
     String(const std::string& str) : String(str.c_str(), str.size()) {}
     String(const char* str, const int64_t size)
     {
-        assert(str != nullptr);
+        OPENVDB_ASSERT(str != nullptr);
         this->ptr = this->SSO; // for the isLocal check in alloc
         this->reset(str, size);
     }
diff --git a/openvdb_ax/openvdb_ax/codegen/StringFunctions.cc b/openvdb_ax/openvdb_ax/codegen/StringFunctions.cc
index 87405d9b0e..afeceee0c3 100644
--- a/openvdb_ax/openvdb_ax/codegen/StringFunctions.cc
+++ b/openvdb_ax/openvdb_ax/codegen/StringFunctions.cc
@@ -14,6 +14,8 @@
 #include "Utils.h"
 #include "String.h"
 
+#include <openvdb/util/Assert.h>
+
 #include "openvdb_ax/compiler/CompilerOptions.h"
 
 namespace openvdb {
@@ -48,7 +50,7 @@ inline FunctionGroup::UniquePtr axstringalloc(const FunctionOptions& op)
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         llvm::LLVMContext& C = B.getContext();
         llvm::Function* base = B.GetInsertBlock()->getParent();
         llvm::Type* strType = LLVMType::get(C);
@@ -69,7 +71,7 @@ inline FunctionGroup::UniquePtr axstringalloc(const FunctionOptions& op)
         {
             llvm::BasicBlock* BB = B.GetInsertBlock();
             llvm::Instruction* inst = llvm::CallInst::CreateFree(cptr_load, BB);
-            assert(inst);
+            OPENVDB_ASSERT(inst);
             B.Insert(inst);
             B.CreateBr(post);
         }
@@ -92,7 +94,7 @@ inline FunctionGroup::UniquePtr axstringalloc(const FunctionOptions& op)
                 B.CreateAdd(size, B.getInt64(1)), // size
                 nullptr,
                 nullptr);
-            assert(inst);
+            OPENVDB_ASSERT(inst);
             B.Insert(inst);
             B.CreateStore(inst, cptr);
             B.CreateBr(post);
@@ -132,7 +134,7 @@ inline FunctionGroup::UniquePtr axstring(const FunctionOptions& op)
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() >= 1);
+        OPENVDB_ASSERT(args.size() >= 1);
         llvm::LLVMContext& C = B.getContext();
         llvm::Type* strType = LLVMType::get(C);
@@ -141,7 +143,7 @@ inline FunctionGroup::UniquePtr axstring(const FunctionOptions& op)
         llvm::Value* carr;
         if (args.size() == 1) carr = B.CreateGlobalStringPtr("");
         else carr = args[1];
-        assert(carr);
+        OPENVDB_ASSERT(carr);
         llvm::Value* slen = axstrlen(op)->execute({carr}, B);
 
         llvm::Value* cptr = B.CreateStructGEP(strType, str, 0); // char**
@@ -196,7 +198,7 @@ inline FunctionGroup::UniquePtr axstringassign(const FunctionOptions& op)
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         llvm::Type* strType = LLVMType::get(B.getContext());
         llvm::Value* str0 = args[0];
         llvm::Value* str1 = args[1];
diff --git a/openvdb_ax/openvdb_ax/codegen/SymbolTable.h b/openvdb_ax/openvdb_ax/codegen/SymbolTable.h
index e6d8d571cf..e03d885ef5 100644
--- a/openvdb_ax/openvdb_ax/codegen/SymbolTable.h
+++ b/openvdb_ax/openvdb_ax/codegen/SymbolTable.h
@@ -13,6 +13,7 @@
 #define OPENVDB_AX_CODEGEN_SYMBOL_TABLE_HAS_BEEN_INCLUDED
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 
@@ -178,7 +179,7 @@ struct SymbolTableBlocks
         // reverse the iterator (which also make it point to the preceding
         // value, hence the crement)
-        assert(it != mTables.end());
+        OPENVDB_ASSERT(it != mTables.end());
         MapType::const_reverse_iterator iter(++it);
 
         for (; iter != mTables.crend(); ++iter) {
diff --git a/openvdb_ax/openvdb_ax/codegen/Types.h b/openvdb_ax/openvdb_ax/codegen/Types.h
index 68242ab8d2..fb07126234 100644
--- a/openvdb_ax/openvdb_ax/codegen/Types.h
+++ b/openvdb_ax/openvdb_ax/codegen/Types.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -102,19 +103,19 @@ struct LLVMType
         llvm::Constant* constant = nullptr;
         if (std::is_floating_point::value) {
-            assert(llvm::ConstantFP::isValueValidForType(type,
+            OPENVDB_ASSERT(llvm::ConstantFP::isValueValidForType(type,
                 llvm::APFloat(static_cast::value, T, double>::type>(V))));
             constant = llvm::ConstantFP::get(type, static_cast(V));
         }
         else if (std::is_integral::value) {
             const constexpr bool isSigned = std::is_signed::value;
-            assert((isSigned && llvm::ConstantInt::isValueValidForType(type, static_cast(V))) ||
+            OPENVDB_ASSERT((isSigned && llvm::ConstantInt::isValueValidForType(type, static_cast(V))) ||
                 (!isSigned && llvm::ConstantInt::isValueValidForType(type, static_cast(V))));
             constant = llvm::ConstantInt::get(type, static_cast(V), isSigned);
         }
-        assert(constant);
+        OPENVDB_ASSERT(constant);
         return constant;
     }
@@ -209,9 +210,9 @@ template <> struct LLVMType
     static inline llvm::Constant* get(llvm::LLVMContext& C, const openvdb::math::half V)
     {
         llvm::Type* type = LLVMType::get(C);
-        assert(llvm::ConstantFP::isValueValidForType(type, llvm::APFloat(V)));
+        OPENVDB_ASSERT(llvm::ConstantFP::isValueValidForType(type, llvm::APFloat(V)));
         llvm::Constant* constant = llvm::ConstantFP::get(type, static_cast(V));
-        assert(constant);
+        OPENVDB_ASSERT(constant);
         return constant;
     }
     static inline llvm::Constant* get(llvm::LLVMContext& C, const openvdb::math::half* const V)
@@ -334,7 +335,7 @@ llvmConstant(const T t, llvm::Type* type)
         return llvm::ConstantInt::get(type, static_cast(t), /*signed*/true);
     }
     else {
-        assert(type->isFloatingPointTy());
+        OPENVDB_ASSERT(type->isFloatingPointTy());
         return llvm::ConstantFP::get(type, static_cast(t));
     }
 }
diff --git a/openvdb_ax/openvdb_ax/codegen/Utils.h b/openvdb_ax/openvdb_ax/codegen/Utils.h
index bb7c0b2337..7f62d50eac 100644
--- a/openvdb_ax/openvdb_ax/codegen/Utils.h
+++ b/openvdb_ax/openvdb_ax/codegen/Utils.h
@@ -18,6 +18,7 @@
 #include "../Exceptions.h"
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -53,8 +54,8 @@ using BinaryFunction = std::function& B, llvm::Value* ptr, const char* Name = "")
 {
-    assert(ptr);
-    assert(ptr->getType()->isPointerTy());
+    OPENVDB_ASSERT(ptr);
+    OPENVDB_ASSERT(ptr->getType()->isPointerTy());
 #if LLVM_VERSION_MAJOR <= 7
     return B.CreateLoad(ptr, Name);
 #else
@@ -66,9 +67,9 @@ inline auto ir_load(llvm::IRBuilder<>& B, llvm::Value* ptr, const char* Name = "
 inline auto ir_gep(llvm::IRBuilder<>& B,
     llvm::Value* ptr, llvm::ArrayRef IdxList, const char* Name = "")
 {
-    assert(ptr);
-    assert(ptr->getType()->getScalarType());
-    assert(ptr->getType()->getScalarType()->isPointerTy());
+    OPENVDB_ASSERT(ptr);
+    OPENVDB_ASSERT(ptr->getType()->getScalarType());
+    OPENVDB_ASSERT(ptr->getType()->getScalarType()->isPointerTy());
 #if LLVM_VERSION_MAJOR <= 7
     return B.CreateGEP(ptr, IdxList, Name);
 #else
@@ -81,9 +82,9 @@ inline auto ir_gep(llvm::IRBuilder<>& B,
 inline auto ir_constgep2_64(llvm::IRBuilder<>& B,
     llvm::Value* ptr, uint64_t Idx0, uint64_t Idx1, const char* Name = "")
 {
-    assert(ptr);
-    assert(ptr->getType()->getScalarType());
-    assert(ptr->getType()->getScalarType()->isPointerTy());
+    OPENVDB_ASSERT(ptr);
+    OPENVDB_ASSERT(ptr->getType()->getScalarType());
+    OPENVDB_ASSERT(ptr->getType()->getScalarType()->isPointerTy());
 #if LLVM_VERSION_MAJOR <= 7
     return B.CreateConstGEP2_64(ptr, Idx0, Idx1, Name);
 #else
@@ -97,9 +98,9 @@ inline auto ir_constgep2_64(llvm::IRBuilder<>& B,
 inline auto ir_constinboundsgep2_64(llvm::IRBuilder<>& B,
     llvm::Value* ptr, uint64_t Idx0, uint64_t Idx1, const char* Name = "")
 {
-    assert(ptr);
-    assert(ptr->getType()->getScalarType());
-    assert(ptr->getType()->getScalarType()->isPointerTy());
+    OPENVDB_ASSERT(ptr);
+    OPENVDB_ASSERT(ptr->getType()->getScalarType());
+    OPENVDB_ASSERT(ptr->getType()->getScalarType()->isPointerTy());
 #if LLVM_VERSION_MAJOR <= 7
     return B.CreateConstInBoundsGEP2_64(ptr, Idx0, Idx1, Name);
 #else
@@ -190,7 +191,7 @@ insertStaticAlloca(llvm::IRBuilder<>& B,
     llvm::Type* strtype = LLVMType::get(B.getContext());
     // Create the allocation at the start of the function block
     llvm::Function* parent = B.GetInsertBlock()->getParent();
-    assert(parent && !parent->empty());
+    OPENVDB_ASSERT(parent && !parent->empty());
     auto IP = B.saveIP();
     llvm::BasicBlock& block = parent->front();
     if (block.empty()) B.SetInsertPoint(&block);
@@ -242,9 +243,9 @@ inline llvm::Type*
 typePrecedence(llvm::Type* const typeA,
     llvm::Type* const typeB)
 {
-    assert(typeA && (typeA->isIntegerTy() || typeA->isFloatingPointTy()) &&
+    OPENVDB_ASSERT(typeA && (typeA->isIntegerTy() || typeA->isFloatingPointTy()) &&
         "First Type in typePrecedence is not a scalar type");
-    assert(typeB && (typeB->isIntegerTy() || typeB->isFloatingPointTy()) &&
+    OPENVDB_ASSERT(typeB && (typeB->isIntegerTy() || typeB->isFloatingPointTy()) &&
         "Second Type in typePrecedence is not a scalar type");
 
     // handle implicit arithmetic conversion
@@ -271,7 +272,7 @@ typePrecedence(llvm::Type* const typeA,
     if (typeA->isIntegerTy(1)) return typeA;
     if (typeB->isIntegerTy(1)) return typeB;
 
-    assert(false && "invalid LLVM type precedence");
+    OPENVDB_ASSERT(false && "invalid LLVM type precedence");
     return nullptr;
 }
@@ -371,7 +372,7 @@ llvmArithmeticConversion(const llvm::Type* const sourceType,
     }
 
 #undef BIND_ARITHMETIC_CAST_OP
-    assert(false && "invalid LLVM type conversion");
+    OPENVDB_ASSERT(false && "invalid LLVM type conversion");
     return CastFunction();
 }
@@ -404,7 +405,7 @@ llvmBinaryConversion(const llvm::Type* const type,
     // a%b in AX is implemented as a floored modulo op and is handled explicitly in binaryExpression
     if (type->isFloatingPointTy()) {
-        assert(!(ast::tokens::operatorType(token) == ast::tokens::LOGICAL ||
+        OPENVDB_ASSERT(!(ast::tokens::operatorType(token) == ast::tokens::LOGICAL ||
             ast::tokens::operatorType(token) == ast::tokens::BITWISE) &&
             "unable to perform logical or bitwise operation on floating point values");
@@ -419,7 +420,7 @@ llvmBinaryConversion(const llvm::Type* const type,
         else if (token == ast::tokens::LESSTHAN) return BIND_BINARY_OP(CreateFCmpOLT);
         else if (token == ast::tokens::MORETHANOREQUAL) return BIND_BINARY_OP(CreateFCmpOGE);
         else if (token == ast::tokens::LESSTHANOREQUAL) return BIND_BINARY_OP(CreateFCmpOLE);
-        assert(false && "unrecognised binary operator");
+        OPENVDB_ASSERT(false && "unrecognised binary operator");
     }
     else if (type->isIntegerTy()) {
         if (token == ast::tokens::PLUS) return BIND_BINARY_OP(CreateAdd); // No Unsigned/Signed Wrap
@@ -440,11 +441,11 @@ llvmBinaryConversion(const llvm::Type* const type,
         else if (token == ast::tokens::BITAND) return BIND_BINARY_OP(CreateAnd);
         else if (token == ast::tokens::BITOR) return BIND_BINARY_OP(CreateOr);
         else if (token == ast::tokens::BITXOR) return BIND_BINARY_OP(CreateXor);
-        assert(false && "unrecognised binary operator");
+        OPENVDB_ASSERT(false && "unrecognised binary operator");
     }
 
 #undef BIND_BINARY_OP
-    assert(false && "invalid LLVM type for binary operation");
+    OPENVDB_ASSERT(false && "invalid LLVM type for binary operation");
     return BinaryFunction();
 }
@@ -452,8 +453,8 @@ llvmBinaryConversion(const llvm::Type* const type,
 /// Type 'to'.
 inline bool isValidCast(llvm::Type* from, llvm::Type* to)
 {
-    assert(from && "llvm Type 'from' is null in isValidCast");
-    assert(to && "llvm Type 'to' is null in isValidCast");
+    OPENVDB_ASSERT(from && "llvm Type 'from' is null in isValidCast");
+    OPENVDB_ASSERT(to && "llvm Type 'to' is null in isValidCast");
 
     if ((from->isIntegerTy() || from->isFloatingPointTy()) &&
         (to->isIntegerTy() || to->isFloatingPointTy())) {
@@ -481,9 +482,9 @@ arithmeticConversion(llvm::Value* value,
     llvm::Type* targetType,
     llvm::IRBuilder<>& builder)
 {
-    assert(value && (value->getType()->isIntegerTy() || value->getType()->isFloatingPointTy()) &&
+    OPENVDB_ASSERT(value && (value->getType()->isIntegerTy() || value->getType()->isFloatingPointTy()) &&
        "First Value in arithmeticConversion is not a scalar type");
-    assert(targetType && (targetType->isIntegerTy() || targetType->isFloatingPointTy()) &&
+    OPENVDB_ASSERT(targetType && (targetType->isIntegerTy() || targetType->isFloatingPointTy()) &&
        "Target Type in arithmeticConversion is not a scalar type");
 
     const llvm::Type* const valueType = value->getType();
@@ -507,18 +508,18 @@ arrayCast(llvm::Value* ptrToArray,
     llvm::Type* targetElementType,
     llvm::IRBuilder<>& builder)
 {
-    assert(targetElementType && (targetElementType->isIntegerTy() ||
+    OPENVDB_ASSERT(targetElementType && (targetElementType->isIntegerTy() ||
        targetElementType->isFloatingPointTy()) &&
        "Target element type is not a scalar type");
-    assert(ptrToArray && ptrToArray->getType()->isPointerTy() &&
+    OPENVDB_ASSERT(ptrToArray && ptrToArray->getType()->isPointerTy() &&
        "Input to arrayCast is not a pointer type.");
 
     llvm::Type* arrayType = ptrToArray->getType()->getContainedType(0);
-    assert(arrayType && llvm::isa(arrayType));
+    OPENVDB_ASSERT(arrayType && llvm::isa(arrayType));
 
     // getArrayElementType() calls getContainedType(0)
     llvm::Type* sourceElementType = arrayType->getArrayElementType();
-    assert(sourceElementType && (sourceElementType->isIntegerTy() ||
+    OPENVDB_ASSERT(sourceElementType && (sourceElementType->isIntegerTy() ||
        sourceElementType->isFloatingPointTy()) &&
        "Source element type is not a scalar type");
@@ -555,12 +556,12 @@ arithmeticConversion(std::vector& values,
     llvm::Type* targetElementType,
     llvm::IRBuilder<>& builder)
 {
-    assert(targetElementType && (targetElementType->isIntegerTy() ||
+    OPENVDB_ASSERT(targetElementType && (targetElementType->isIntegerTy() ||
        targetElementType->isFloatingPointTy()) &&
        "Target element type is not a scalar type");
 
     llvm::Type* sourceElementType = values.front()->getType();
-    assert(sourceElementType && (sourceElementType->isIntegerTy() ||
+    OPENVDB_ASSERT(sourceElementType && (sourceElementType->isIntegerTy() ||
        sourceElementType->isFloatingPointTy()) &&
        "Source element type is not a scalar type");
@@ -625,7 +626,7 @@ boolComparison(llvm::Value* value,
     if (type->isFloatingPointTy()) return builder.CreateFCmpONE(value, llvm::ConstantFP::get(type, 0.0));
     else if (type->isIntegerTy(1)) return builder.CreateICmpNE(value, llvm::ConstantInt::get(type, 0));
     else if (type->isIntegerTy()) return builder.CreateICmpNE(value, llvm::ConstantInt::getSigned(type, 0));
-    assert(false && "Invalid type for bool conversion");
+    OPENVDB_ASSERT(false && "Invalid type for bool conversion");
     return nullptr;
 }
@@ -643,7 +644,7 @@ binaryOperator(llvm::Value* lhs, llvm::Value* rhs,
     llvm::IRBuilder<>& builder)
 {
     llvm::Type* lhsType = lhs->getType();
-    assert(lhsType == rhs->getType() ||
+    OPENVDB_ASSERT(lhsType == rhs->getType() ||
        (token == ast::tokens::SHIFTLEFT || token == ast::tokens::SHIFTRIGHT));
@@ -718,7 +719,7 @@ array3Unpack(llvm::Value* ptrToArray,
     llvm::Value*& value3,
     llvm::IRBuilder<>& builder)
 {
-    assert(ptrToArray && ptrToArray->getType()->isPointerTy() &&
+    OPENVDB_ASSERT(ptrToArray && ptrToArray->getType()->isPointerTy() &&
        "Input to array3Unpack is not a pointer type.");
 
     value1 = ir_constgep2_64(builder, ptrToArray, 0, 0);
@@ -776,7 +777,7 @@ arrayPack(llvm::Value* value,
     llvm::IRBuilder<>& builder,
     const size_t size = 3)
 {
-    assert(value && (value->getType()->isIntegerTy() ||
+    OPENVDB_ASSERT(value && (value->getType()->isIntegerTy() ||
        value->getType()->isFloatingPointTy()) &&
        "value type is not a scalar type");
@@ -850,7 +851,7 @@ scalarToMatrix(llvm::Value* scalar,
     llvm::IRBuilder<>& builder,
     const size_t dim = 3)
 {
-    assert(scalar && (scalar->getType()->isIntegerTy() ||
+    OPENVDB_ASSERT(scalar && (scalar->getType()->isIntegerTy() ||
        scalar->getType()->isFloatingPointTy()) &&
        "value type is not a scalar type");
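Reviewer note: the ir_* wrappers above exist because LLVM 8+ requires the pointee type to be spelled out when building loads and GEPs. A hypothetical call site (loadMatrixElement is illustrative, not from the patch) showing how the helpers keep code version-agnostic:

    // Illustrative only: composes the patch's ir_constgep2_64/ir_load helpers.
    llvm::Value* loadMatrixElement(llvm::IRBuilder<>& B, llvm::Value* mat, uint64_t i)
    {
        OPENVDB_ASSERT(mat && mat->getType()->isPointerTy());
        llvm::Value* elem = ir_constgep2_64(B, mat, 0, i); // address of mat[0][i]
        return ir_load(B, elem); // picks the right CreateLoad overload per LLVM version
    }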
diff --git a/openvdb_ax/openvdb_ax/codegen/VolumeComputeGenerator.cc b/openvdb_ax/openvdb_ax/codegen/VolumeComputeGenerator.cc
index 2684a1679d..3579ac5e13 100644
--- a/openvdb_ax/openvdb_ax/codegen/VolumeComputeGenerator.cc
+++ b/openvdb_ax/openvdb_ax/codegen/VolumeComputeGenerator.cc
@@ -9,6 +9,8 @@
 #include "Types.h"
 #include "Utils.h"
 
+#include <openvdb/util/Assert.h>
+
 #include "../Exceptions.h"
 #include "../ast/Scanners.h"
@@ -91,15 +93,15 @@ inline void VolumeComputeGenerator::computek2(llvm::Function* compute, const Att
         [&](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 9);
+        OPENVDB_ASSERT(args.size() == 9);
         llvm::Value* vbuff = args[2]; //extractArgument(rangeFunction, "value_buffer");
         llvm::Value* abuff = args[3]; //extractArgument(rangeFunction, "active_buffer");
         llvm::Value* buffSize = args[4]; //extractArgument(rangeFunction, "buffer_size");
         llvm::Value* mode = args[5]; //extractArgument(rangeFunction, "mode");
-        assert(buffSize);
-        assert(vbuff);
-        assert(abuff);
-        assert(mode);
+        OPENVDB_ASSERT(buffSize);
+        OPENVDB_ASSERT(vbuff);
+        OPENVDB_ASSERT(abuff);
+        OPENVDB_ASSERT(mode);
 
         llvm::Function* base = B.GetInsertBlock()->getParent();
         llvm::LLVMContext& C = B.getContext();
@@ -203,7 +205,7 @@ inline void VolumeComputeGenerator::computek3(llvm::Function* compute, const Att
         [&, this](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 6);
+        OPENVDB_ASSERT(args.size() == 6);
         llvm::Value* isc = args[1]; // index space coord
         llvm::Value* wi = args[4]; // write index
         llvm::Value* wa = args[5]; // write_accessor
@@ -219,7 +221,7 @@ inline void VolumeComputeGenerator::computek3(llvm::Function* compute, const Att
         type = type->getPointerElementType();
 
         llvm::Value* registeredIndex = this->mModule.getGlobalVariable(token);
-        assert(registeredIndex);
+        OPENVDB_ASSERT(registeredIndex);
         registeredIndex = ir_load(B, registeredIndex);
 
         llvm::Value* result = B.CreateICmpEQ(wi, registeredIndex);
@@ -337,13 +339,13 @@ AttributeRegistry::Ptr VolumeComputeGenerator::generate(const ast::Tree& tree)
         {
             llvm::Value* vptr = mBuilder.CreateAlloca(type->getPointerTo(0));
             localTable->insert(data.tokenname() + "_vptr", vptr);
-            assert(llvm::cast(vptr)->isStaticAlloca());
+            OPENVDB_ASSERT(llvm::cast(vptr)->isStaticAlloca());
         }
 
         // @warning This method will insert the alloc before the above alloc.
         // This is fine, but is worth noting
         llvm::Value* value = insertStaticAlloca(mBuilder, type);
-        assert(llvm::cast(value)->isStaticAlloca());
+        OPENVDB_ASSERT(llvm::cast(value)->isStaticAlloca());
 
         // @note this technically doesn't need to live in the local table
         // (only the pointer to this value (_vptr) needs to) but it's
@@ -382,7 +384,7 @@ bool VolumeComputeGenerator::visit(const ast::Attribute* node)
     llvm::Value* value;
     value = localTable->get(globalName + "_vptr");
     value = ir_load(mBuilder, value);
-    assert(value);
+    OPENVDB_ASSERT(value);
     mValues.push(value);
     return true;
 }
@@ -417,8 +419,8 @@ void VolumeComputeGenerator::getAccessorValue(const std::string& globalName, llv
     {
         llvm::Value* valueptr = extractArgument(mFunction, "value");
         llvm::Value* offset = extractArgument(mFunction, "offset");
-        assert(valueptr);
-        assert(offset);
+        OPENVDB_ASSERT(valueptr);
+        OPENVDB_ASSERT(offset);
         llvm::Type* type = location->getType(); // ValueType*
         valueptr = mBuilder.CreatePointerCast(valueptr, type);
@@ -436,10 +438,10 @@ void VolumeComputeGenerator::getAccessorValue(const std::string& globalName, llv
         llvm::Value* transformPtr = extractArgument(mFunction, "transforms");
         llvm::Value* origin = extractArgument(mFunction, "origin");
         llvm::Value* offset = extractArgument(mFunction, "offset");
-        assert(accessorPtr);
-        assert(transformPtr);
-        assert(origin);
-        assert(offset);
+        OPENVDB_ASSERT(accessorPtr);
+        OPENVDB_ASSERT(transformPtr);
+        OPENVDB_ASSERT(origin);
+        OPENVDB_ASSERT(offset);
         accessorPtr = ir_gep(mBuilder, accessorPtr, registeredIndex);
         llvm::Value* targetTransform = ir_gep(mBuilder, transformPtr, registeredIndex);
@@ -473,7 +475,7 @@ llvm::Value* VolumeComputeGenerator::accessorHandleFromToken(const std::string&
     // The result is a loaded void* value
 
     llvm::Value* accessorPtr = extractArgument(mFunction, "accessors");
-    assert(accessorPtr);
+    OPENVDB_ASSERT(accessorPtr);
     accessorPtr = ir_gep(mBuilder, accessorPtr, registeredIndex);
 
     // return loaded void** = void*
diff --git a/openvdb_ax/openvdb_ax/codegen/VolumeFunctions.cc b/openvdb_ax/openvdb_ax/codegen/VolumeFunctions.cc
index ef84dc72e1..26a7683be9 100644
--- a/openvdb_ax/openvdb_ax/codegen/VolumeFunctions.cc
+++ b/openvdb_ax/openvdb_ax/codegen/VolumeFunctions.cc
@@ -20,6 +20,7 @@
 #include "openvdb_ax/Exceptions.h"
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -65,7 +66,7 @@ inline FunctionGroup::UniquePtr axcoordtooffset(const FunctionOptions& op)
     static auto generate =
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 1);
+        OPENVDB_ASSERT(args.size() == 1);
         OPENVDB_AX_CHECK_MODULE_CONTEXT(B);
         llvm::Value* x = ir_constgep2_64(B, args[0], 0, 0);
         llvm::Value* y = ir_constgep2_64(B, args[0], 0, 1);
@@ -132,7 +133,7 @@ inline FunctionGroup::UniquePtr axoffsettocoord(const FunctionOptions& op)
     static auto generate =
         [](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 2);
+        OPENVDB_ASSERT(args.size() == 2);
         OPENVDB_AX_CHECK_MODULE_CONTEXT(B);
 
         llvm::Value* ijk = args[0];
@@ -200,7 +201,7 @@ inline FunctionGroup::UniquePtr axoffsettoglobalcoord(const FunctionOptions& op)
     auto generate =
         [op](const std::vector& args,
            llvm::IRBuilder<>& B) -> llvm::Value*
     {
-        assert(args.size() == 3);
+        OPENVDB_ASSERT(args.size() == 3);
         OPENVDB_AX_CHECK_MODULE_CONTEXT(B);
 
         llvm::Value* result = args[0];
@@ -385,15 +386,15 @@ inline FunctionGroup::UniquePtr axsetvoxel(const FunctionOptions& op)
         using RootNodeType = typename GridType::TreeType::RootNodeType;
         using AccessorType = typename GridType::Accessor;
 
-        assert(accessor);
-        assert(coord);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(coord);
 
         // set value only to avoid changing topology
         const openvdb::Coord* ijk = reinterpret_cast(coord);
         AccessorType* const accessorPtr = static_cast(accessor);
 
         if (level != -1) {
-            assert(level >= 0);
+            OPENVDB_ASSERT(level >= 0);
             accessorPtr->addTile(Index(level), *ijk, *value, ison);
         }
         else {
@@ -402,7 +403,7 @@ inline FunctionGroup::UniquePtr axsetvoxel(const FunctionOptions& op)
             const int depth = accessorPtr->getValueDepth(*ijk);
             if (depth == static_cast(RootNodeType::LEVEL)) {
                 // voxel/leaf level
-                assert(accessorPtr->probeConstLeaf(*ijk));
+                OPENVDB_ASSERT(accessorPtr->probeConstLeaf(*ijk));
                 if (ison) accessorPtr->setValueOn(*ijk, *value);
                 else accessorPtr->setValueOff(*ijk, *value);
             }
@@ -420,12 +421,12 @@ inline FunctionGroup::UniquePtr axsetvoxel(const FunctionOptions& op)
                 using NodeT2 = typename AccessorType::template NodeTypeAtLevel<2>;
                 if (NodeT1* node = accessorPtr->template getNode()) {
                     const openvdb::Index index = node->coordToOffset(*ijk);
-                    assert(node->isChildMaskOff(index));
+                    OPENVDB_ASSERT(node->isChildMaskOff(index));
                     node->addTile(index, *value, ison);
                 }
                 else if (NodeT2* node = accessorPtr->template getNode()) {
                     const openvdb::Index index = node->coordToOffset(*ijk);
-                    assert(node->isChildMaskOff(index));
+                    OPENVDB_ASSERT(node->isChildMaskOff(index));
                     node->addTile(index, *value, ison);
                 }
                 else {
@@ -533,9 +534,9 @@ inline FunctionGroup::UniquePtr axgetvoxel(const FunctionOptions& op)
         using GridType = typename openvdb::BoolGrid::ValueConverter::Type;
         using AccessorType = typename GridType::Accessor;
 
-        assert(accessor);
-        assert(coord);
-        assert(value);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(coord);
+        OPENVDB_ASSERT(value);
 
         const openvdb::Coord* ijk = reinterpret_cast(coord);
         (*value) = static_cast(accessor)->getValue(*ijk);
@@ -549,9 +550,9 @@ inline FunctionGroup::UniquePtr axgetvoxel(const FunctionOptions& op)
         using GridType = openvdb::BoolGrid::ValueConverter::Type;
         using AccessorType = GridType::Accessor;
 
-        assert(accessor);
-        assert(coord);
-        assert(value);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(coord);
+        OPENVDB_ASSERT(value);
 
         const openvdb::Coord* ijk = reinterpret_cast(coord);
         const std::string& str = static_cast(accessor)->getValue(*ijk);
@@ -572,10 +573,10 @@ inline FunctionGroup::UniquePtr axgetvoxel(const FunctionOptions& op)
         using LeafNodeT = typename GridType::TreeType::LeafNodeType;
         using AccessorType = typename GridType::Accessor;
 
-        assert(accessor);
-        assert(origin);
-        assert(sourceTransform);
-        assert(targetTransform);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(origin);
+        OPENVDB_ASSERT(sourceTransform);
+        OPENVDB_ASSERT(targetTransform);
 
         const AccessorType* const accessorPtr = static_cast(accessor);
         const openvdb::math::Transform* const sourceTransformPtr =
@@ -601,10 +602,10 @@ inline FunctionGroup::UniquePtr axgetvoxel(const FunctionOptions& op)
         using LeafNodeT = typename GridType::TreeType::LeafNodeType;
         using AccessorType = typename GridType::Accessor;
 
-        assert(accessor);
-        assert(origin);
-        assert(sourceTransform);
-        assert(targetTransform);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(origin);
+        OPENVDB_ASSERT(sourceTransform);
+        OPENVDB_ASSERT(targetTransform);
 
         const AccessorType* const accessorPtr = static_cast(accessor);
         const openvdb::math::Transform* const sourceTransformPtr =
@@ -737,10 +738,10 @@ inline FunctionGroup::UniquePtr axprobevalue(const FunctionOptions& op)
         using GridType = typename openvdb::BoolGrid::ValueConverter::Type;
         using AccessorType = typename GridType::Accessor;
 
-        assert(accessor);
-        assert(coord);
-        assert(value);
-        assert(ison);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(coord);
+        OPENVDB_ASSERT(value);
+        OPENVDB_ASSERT(ison);
 
         const openvdb::Coord* ijk = reinterpret_cast(coord);
         *ison = static_cast(accessor)->probeValue(*ijk, *value);
@@ -755,10 +756,10 @@ inline FunctionGroup::UniquePtr axprobevalue(const FunctionOptions& op)
         using GridType = openvdb::BoolGrid::ValueConverter::Type;
         using AccessorType = GridType::Accessor;
 
-        assert(accessor);
-        assert(coord);
-        assert(value);
-        assert(ison);
+        OPENVDB_ASSERT(accessor);
+        OPENVDB_ASSERT(coord);
+        OPENVDB_ASSERT(value);
+        OPENVDB_ASSERT(ison);
 
         const openvdb::Coord* ijk = reinterpret_cast(coord);
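Reviewer note: the generated axprobevalue call mirrors what host code would do with a value accessor. A rough host-side equivalent, for orientation only (FloatGrid picked arbitrarily; not part of the patch):

    #include <openvdb/openvdb.h>
    // Returns true if the voxel/tile at ijk is active; writes its value out.
    bool probe(const openvdb::FloatGrid& grid, const openvdb::Coord& ijk, float& value)
    {
        auto acc = grid.getConstAccessor();
        return acc.probeValue(ijk, value);
    }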
diff --git a/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h b/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h
index 6e0b62d77a..d5aab1f48b 100644
--- a/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h
+++ b/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 
@@ -69,7 +70,7 @@ class AttributeRegistry
         const std::vector& uses() const { return mUses; }
 
         bool dependson(const AccessData* data) const {
-            assert(data);
+            OPENVDB_ASSERT(data);
             for (auto& dep : mDependencies) {
                 if (dep == data) return true;
             }
@@ -227,11 +228,11 @@ inline AttributeRegistry::Ptr AttributeRegistry::create(const ast::Tree& tree)
         ast::attributeDependencyTokens(tree, name, typetoken, deps);
         if (deps.empty()) continue;
 
-        assert(indexmap.find(attrib) != indexmap.cend());
+        OPENVDB_ASSERT(indexmap.find(attrib) != indexmap.cend());
         const size_t index = indexmap.at(attrib);
         AccessData& access = registry->mAccesses[index];
         for (const std::string& dep : deps) {
-            assert(indexmap.find(dep) != indexmap.cend());
+            OPENVDB_ASSERT(indexmap.find(dep) != indexmap.cend());
             const size_t depindex = indexmap.at(dep);
             access.mDependencies.emplace_back(&registry->mAccesses[depindex]);
         }
diff --git a/openvdb_ax/openvdb_ax/compiler/Compiler.cc b/openvdb_ax/openvdb_ax/compiler/Compiler.cc
index 1f2d7b8470..2e171710ce 100644
--- a/openvdb_ax/openvdb_ax/compiler/Compiler.cc
+++ b/openvdb_ax/openvdb_ax/compiler/Compiler.cc
@@ -15,6 +15,7 @@
 #include "openvdb_ax/Exceptions.h"
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -450,7 +451,7 @@ bool initializeGlobalFunctions(const codegen::FunctionRegistry& registry,
             getMangledName(llvm::cast(&F), engine);
         const uint64_t address = engine.getAddressToGlobalIfAvailable(mangled);
-        assert(address != 0 && "Unbound function!");
+        OPENVDB_ASSERT(address != 0 && "Unbound function!");
     }
 #endif
@@ -524,7 +525,7 @@ registerAccesses(const codegen::SymbolTable& globals, const AttributeRegistry& r
         const size_t index = registry.accessIndex(name, typetoken);
 
         // should always be a GlobalVariable.
-        assert(llvm::isa(global.second));
+        OPENVDB_ASSERT(llvm::isa(global.second));
 
         // Assign the attribute index global a valid index.
         // @note executionEngine->addGlobalMapping() can also be used if the indices
@@ -533,7 +534,7 @@ registerAccesses(const codegen::SymbolTable& globals, const AttributeRegistry& r
         llvm::GlobalVariable* variable = llvm::cast(global.second);
-        assert(variable->getValueType()->isIntegerTy(64));
+        OPENVDB_ASSERT(variable->getValueType()->isIntegerTy(64));
 
         variable->setInitializer(llvm::ConstantInt::get(variable->getValueType(), index));
         variable->setConstant(true); // is not written to at runtime
@@ -583,7 +584,7 @@ registerExternalGlobals(const codegen::SymbolTable& globals,
             case ast::tokens::UNKNOWN :
             default : {
                 // grammar guarantees this is unreachable as long as all types are supported
-                assert(false && "Attribute type unsupported or not recognised");
+                OPENVDB_ASSERT(false && "Attribute type unsupported or not recognised");
                 return nullptr;
             }
         }
@@ -604,10 +605,10 @@ registerExternalGlobals(const codegen::SymbolTable& globals,
         if (!dataPtr) dataPtr.reset(new CustomData);
 
         // should always be a GlobalVariable.
-        assert(llvm::isa(global.second));
+        OPENVDB_ASSERT(llvm::isa(global.second));
 
         llvm::GlobalVariable* variable = llvm::cast(global.second);
-        assert(variable->getValueType() == codegen::LLVMType::get(C));
+        OPENVDB_ASSERT(variable->getValueType() == codegen::LLVMType::get(C));
 
         llvm::Constant* initializer = initializerFromToken(typetoken, name, *dataPtr);
@@ -704,7 +705,7 @@ Compiler::compile(const ast::Tree& tree,
     // if there has been a compilation error through user error, exit
     if (!attributes) {
-        assert(logger.hasError());
+        OPENVDB_ASSERT(logger.hasError());
         return nullptr;
     }
diff --git a/openvdb_ax/openvdb_ax/compiler/Compiler.h b/openvdb_ax/openvdb_ax/compiler/Compiler.h
index 6c4500a205..3f522278c8 100644
--- a/openvdb_ax/openvdb_ax/compiler/Compiler.h
+++ b/openvdb_ax/openvdb_ax/compiler/Compiler.h
@@ -23,6 +23,7 @@
 #include "openvdb_ax/ast/Parse.h"
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -128,14 +129,14 @@ class OPENVDB_AX_API Compiler
             for (const auto& e : errors) os << e << "\n";
             OPENVDB_THROW(AXSyntaxError, os.str());
         }
-        assert(syntaxTree);
+        OPENVDB_ASSERT(syntaxTree);
         typename ExecutableT::Ptr exe = this->compile(*syntaxTree, logger, data);
         if (!errors.empty()) {
             std::ostringstream os;
             for (const auto& e : errors) os << e << "\n";
             OPENVDB_THROW(AXCompilerError, os.str());
         }
-        assert(exe);
+        OPENVDB_ASSERT(exe);
         return exe;
     }
@@ -163,7 +164,7 @@ class OPENVDB_AX_API Compiler
             for (const auto& e : errors) os << e << "\n";
             OPENVDB_THROW(AXCompilerError, os.str());
         }
-        assert(exe);
+        OPENVDB_ASSERT(exe);
         return exe;
     }
diff --git a/openvdb_ax/openvdb_ax/compiler/Logger.cc b/openvdb_ax/openvdb_ax/compiler/Logger.cc
index 46a83baf32..7b4c448797 100644
--- a/openvdb_ax/openvdb_ax/compiler/Logger.cc
+++ b/openvdb_ax/openvdb_ax/compiler/Logger.cc
@@ -5,6 +5,8 @@
 
 #include "Logger.h"
 
+#include <openvdb/util/Assert.h>
+
 #include 
 
 namespace openvdb {
@@ -130,7 +132,7 @@ nodeToCodeLocation(const ast::Node* node,
         & map)
 {
     if (!tree) return Logger::CodeLocation(0,0);
-    assert(node);
+    OPENVDB_ASSERT(node);
     std::stack pathStack = pathStackFromNode(node);
     const ast::Node* nodeInMap = nodeFromPathStack(pathStack, *tree);
     const auto locationIter = map.find(nodeInMap);
diff --git a/openvdb_ax/openvdb_ax/compiler/PointExecutable.cc b/openvdb_ax/openvdb_ax/compiler/PointExecutable.cc
index c84a12c9b3..b137f667a7 100644
--- a/openvdb_ax/openvdb_ax/compiler/PointExecutable.cc
+++ b/openvdb_ax/openvdb_ax/compiler/PointExecutable.cc
@@ -17,6 +17,7 @@
 #include "openvdb_ax/codegen/Codecs.h"
 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -55,7 +56,7 @@ struct PointExecutable::Settings
 
     inline std::vector optional()
     {
-        assert(IsCLI);
+        OPENVDB_ASSERT(IsCLI);
         std::vector params {
             &this->mCreateMissing,
             &this->mGroup,
@@ -331,7 +332,7 @@ struct PointFunctionArguments
         using FunctionTraitsT = codegen::PointKernelBufferRange::FunctionTraitsT;
         using ReturnT = FunctionTraitsT::ReturnType;
 
-        assert(mData.mUseBufferKernel);
+        OPENVDB_ASSERT(mData.mUseBufferKernel);
 
         return [&](const openvdb::Coord& origin, void* buffer, Index64* mask, const size_t size) -> ReturnT {
             mData.mKernelBufferRange(static_cast::Type>(mData.mCustomData),
@@ -391,7 +392,7 @@ struct PointFunctionArguments
             const codegen::Codec* codec =
                 codegen::getCodec(ast::tokens::tokenFromTypeString(array.valueType()), array.codecType());
             if (codec) flag |= codec->flag();
-            assert(array.isDataLoaded() && !array.isUniform());
+            OPENVDB_ASSERT(array.isDataLoaded() && !array.isUniform());
         }
         else {
             typename WriteHandle::UniquePtr handle(new WriteHandle(leaf, Index(pos)));
@@ -404,14 +405,14 @@ struct PointFunctionArguments
 
     inline void addGroupHandle(const LeafT& leaf, const std::string& name)
     {
-        assert(leaf.attributeSet().descriptor().hasGroup(name));
+        OPENVDB_ASSERT(leaf.attributeSet().descriptor().hasGroup(name));
         mGroupHandles.emplace_back(new points::GroupHandle(leaf.groupHandle(name)));
         mVoidGroupHandles.emplace_back(static_cast(mGroupHandles.back().get()));
     }
 
     inline void addGroupWriteHandle(LeafT& leaf, const std::string& name)
     {
-        assert(leaf.attributeSet().descriptor().hasGroup(name));
+        OPENVDB_ASSERT(leaf.attributeSet().descriptor().hasGroup(name));
         mGroupHandles.emplace_back(new points::GroupWriteHandle(leaf.groupWriteHandle(name)));
         mVoidGroupHandles.emplace_back(static_cast(mGroupHandles.back().get()));
     }
@@ -422,7 +423,7 @@ struct PointFunctionArguments
     addAttributeHandle(LeafT& leaf, const std::string& name, const ast::tokens::CoreType type, const bool write)
    {
         // assert so the executer can be marked as noexcept (assuming nothing throws in compute)
-        assert(supported(type) && "Could not retrieve attribute handle from unsupported type");
+        OPENVDB_ASSERT(supported(type) && "Could not retrieve attribute handle from unsupported type");
         switch (type) {
             case ast::tokens::BOOL : return this->addAttributeHandleTyped(leaf, name, write);
             case ast::tokens::CHAR : return this->addAttributeHandleTyped(leaf, name, write);
@@ -457,7 +458,7 @@ struct PointFunctionArguments
     {
         const size_t pos = leaf.attributeSet().find(name);
         //assert(!leaf.attributeSet().isShared(pos));
-        assert(pos != openvdb::points::AttributeSet::INVALID_POS);
+        OPENVDB_ASSERT(pos != openvdb::points::AttributeSet::INVALID_POS);
         if (write) this->addWriteHandle(leaf, pos);
         else this->addHandle(leaf, pos);
     }
@@ -503,7 +504,7 @@ struct PointExecuterDeformer
     void apply(Vec3d& position, const IterT& iter) const
     {
         if (mFilter.valid(iter)) {
-            assert(mPws);
+            OPENVDB_ASSERT(mPws);
             position = Vec3d(mPws->get(*iter));
         }
     }
@@ -647,7 +648,7 @@ void processAttributes(points::PointDataGrid& grid,
     Logger& logger)
 {
     const auto leafIter = grid.tree().cbeginLeaf();
-    assert(leafIter);
+    OPENVDB_ASSERT(leafIter);
 
     attributeInfo.reserve(registry.data().size());
@@ -682,7 +683,7 @@ void processAttributes(points::PointDataGrid& grid,
             if (pos != points::AttributeSet::INVALID_POS) {
                 const points::AttributeArray* const array = leafIter->attributeSet().getConst(pos);
-                assert(array);
+                OPENVDB_ASSERT(array);
                 if (array->stride() > 1) {
                     logger.warning("Attribute \"" + name +
                         (name != iter.name() ? "\" [bound to \"" + iter.name() + "\"]" : "\"") +
                         " on grid \"" + grid.getName() + "\"is a strided (array) attribute. "
@@ -704,7 +705,7 @@ void processAttributes(points::PointDataGrid& grid,
                 continue;
             }
 
-            assert(supported(iter.type()));
+            OPENVDB_ASSERT(supported(iter.type()));
             const NamePair type = typePairFromToken(iter.type());
             points::appendAttribute(grid.tree(), name, type);
         }
@@ -755,9 +756,9 @@ PointExecutable::PointExecutable(const std::shared_ptr& ,
     , mFunctionAddresses(functions)
     , mSettings(new Settings)
 {
-    assert(mContext);
-    assert(mExecutionEngine);
-    assert(mAttributeRegistry);
+    OPENVDB_ASSERT(mContext);
+    OPENVDB_ASSERT(mExecutionEngine);
+    OPENVDB_ASSERT(mAttributeRegistry);
 
     // parse the AST for known functions which require pre/post processing
     mSettings->mPostDelete = ast::callsFunction(ast, "deletepoint");
diff --git a/openvdb_ax/openvdb_ax/compiler/VolumeExecutable.cc b/openvdb_ax/openvdb_ax/compiler/VolumeExecutable.cc
index dd7607ff0b..8df087d2ce 100644
--- a/openvdb_ax/openvdb_ax/compiler/VolumeExecutable.cc
+++ b/openvdb_ax/openvdb_ax/compiler/VolumeExecutable.cc
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <openvdb/util/Assert.h>
 
 #include 
 #include 
@@ -70,7 +71,7 @@ struct VolumeExecutable::Settings
 
     inline std::vector optional()
     {
-        assert(IsCLI);
+        OPENVDB_ASSERT(IsCLI);
         std::vector params {
             &this->mCreateMissing,
             &this->mTreeExecutionLevel,
@@ -275,7 +276,7 @@ inline openvdb::GridBase::Ptr
 createGrid(const ast::tokens::CoreType& type)
 {
     // assert so the executer can be marked as noexcept (assuming nothing throws in compute)
-    assert(supported(type) && "Could not retrieve accessor from unsupported type");
+    OPENVDB_ASSERT(supported(type) && "Could not retrieve accessor from unsupported type");
     switch (type) {
         case ast::tokens::BOOL : return ConverterT::create();
         case ast::tokens::INT16 : return ConverterT::create();
@@ -426,9 +427,9 @@ struct VolumeFunctionArguments
     inline void
     addAccessor(openvdb::GridBase* grid, const ast::tokens::CoreType& type)
     {
-        assert(grid);
+        OPENVDB_ASSERT(grid);
         // assert so the executer can be marked as noexcept (assuming nothing throws in compute)
-        assert(supported(type) && "Could not retrieve accessor from unsupported type");
+        OPENVDB_ASSERT(supported(type) && "Could not retrieve accessor from unsupported type");
         switch (type) {
             case ast::tokens::BOOL : { this->addAccessor(static_cast*>(grid)->tree()); return; }
             case ast::tokens::INT16 : { this->addAccessor(static_cast*>(grid)->tree()); return; }
@@ -517,7 +518,7 @@ struct VolumeExecuterOp
     {
         // if the current node level does not match, skip
        const Index level = node.getLevel();
-        assert(level > 0);
+        OPENVDB_ASSERT(level > 0);
         if (level < mData.mTreeLevelMin) return;
         if (level > mData.mTreeLevelMax) return;
@@ -526,7 +527,7 @@ struct VolumeExecuterOp
         // streaming ACTIVE tiles (this is an artificial limitation to stop
         // typical VDBs memory exploding when things like inactive root
         // node tiles are streamed).
-        assert((!std::is_same::value));
+        OPENVDB_ASSERT((!std::is_same::value));
 
         // Process ACTIVE values
         this->process(node);
@@ -563,7 +564,7 @@ struct VolumeExecuterOp
             // Manually skip child topology (not-skipped by the ValueOff iterator)
             if (std::is_same::value &&
                 this->isChildMaskOn(node, it.pos())) continue;
-            assert(!this->isChildMaskOn(node, it.pos()));
+            OPENVDB_ASSERT(!this->isChildMaskOn(node, it.pos()));
             kernel(it.getCoord());
         }
     }
@@ -782,7 +783,7 @@ struct VolumeExecuterOp
     void process(NodeT& parent) const
     {
         using ChildNodeT = typename NodeT::ChildNodeType;
-        assert((!std::is_same::value));
+        OPENVDB_ASSERT((!std::is_same::value));
 
         // Explicitly use a ValueOn Iterator (only stream ON Values)
         for (auto it = ValueOnIter::IterTraitsT::begin(parent); it; ++it) {
@@ -791,9 +792,9 @@ struct VolumeExecuterOp
             // ValueIter should never point to a child node - only time this is
             // possible is with a ValueOff iter, but this code only ever invoked
             // with a ValueOnIter
-            assert(!this->isChildMaskOn(parent, it.pos()));
+            OPENVDB_ASSERT(!this->isChildMaskOn(parent, it.pos()));
             // only processes active tiles
-            assert(it.isValueOn());
+            OPENVDB_ASSERT(it.isValueOn());
 
             ValueT _value = value;
             bool _active = true;
@@ -816,7 +817,7 @@ struct VolumeExecuterOp
         static_assert(ChildNodeT::DIM == LeafNodeT::DIM,
             "Expected the parent node type of LeafNodeT to have a "
             "CHILD_DIM equal to the DIM of a LeafNodeT.");
-        assert((!std::is_same::value));
+        OPENVDB_ASSERT((!std::is_same::value));
 
         // only process active tiles when streaming
         if (parent.getValueMask().isOff()) return;
@@ -872,7 +873,7 @@ struct VolumeExecuterOp
         std::vector& tiles) const
     {
         // ValueOff iterators should explicitly disable tile streaming
-        assert((!std::is_same::value));
+        OPENVDB_ASSERT((!std::is_same::value));
         // @todo update to new InternalNode API methods when available
         auto* const table = const_cast(parent.getTable());
         const auto& mask = parent.getValueMask();
@@ -887,7 +888,7 @@ struct VolumeExecuterOp
         for (Index n = range.begin(), N = range.end(); n < N; ++n) {
             // explicitly only process active tiles when streaming
             if (!mask.isOn(n)) continue;
-            assert(!this->isChildMaskOn(parent, n));
+            OPENVDB_ASSERT(!this->isChildMaskOn(parent, n));
 
             const Coord& ijk = parent.offsetToGlobalCoord(n);
             const ValueT& value = table[n].getValue();
@@ -938,7 +939,7 @@ struct VolumeExecuterOp
         std::vector& tiles) const
     {
         // ValueOff iterators should explicitly disable tile streaming
-        assert((!std::is_same::value));
+        OPENVDB_ASSERT((!std::is_same::value));
         using TempBufferT = typename std::conditional<
             std::is_same::value,
             ax::codegen::String, bool>::type;
@@ -960,7 +961,7 @@ struct VolumeExecuterOp
         for (Index n = range.begin(), N = range.end(); n < N; ++n) {
             // explicitly only process active tiles when streaming
             if (!mask.isOn(n)) continue;
-            assert(!this->isChildMaskOn(parent, n));
+            OPENVDB_ASSERT(!this->isChildMaskOn(parent, n));
 
             const Coord& ijk = parent.offsetToGlobalCoord(n);
             const TempBufferT value = table[n].getValue();
@@ -1029,7 +1030,7 @@ registerVolumes(GridPtrVec& grids,
         const std::string& iterName = iter.name();
         const std::string* volumeNamePtr = nullptr;
         volumeNamePtr = bindings.isBoundAXName(iterName) ? bindings.dataNameBoundTo(iterName) : &iterName;
-        assert(volumeNamePtr);
+        OPENVDB_ASSERT(volumeNamePtr);
         const std::string& volumeName = *volumeNamePtr;
         for (const auto& grid : grids) {
             if (grid->getName() != volumeName) continue;
@@ -1080,7 +1081,7 @@ registerVolumes(GridPtrVec& grids,
         if (iter.writes() && iter.affectsothers()) {
             // if affectsothers(), it's also read from at some point
-            assert(iter.reads());
+            OPENVDB_ASSERT(iter.reads());
             cache->addReadGrid(*matchedGrid, /*copy=*/true);
             cache->addWriteGrid(*matchedGrid);
         }
@@ -1103,7 +1104,7 @@ inline void run(GridT& grid, OpData& data, const VolumeExecutable& E)
     // Get the active index of the grid being executed
     const ast::tokens::CoreType type = ast::tokens::tokenFromTypeString(grid.valueType());
-    assert(data.mActiveIndex >= 0);
+    OPENVDB_ASSERT(data.mActiveIndex >= 0);
 
     // Set the active tile streaming behaviour for this grid if
     // the behaviour is set to AUTO (otherwise it's assigned the
@@ -1120,7 +1121,7 @@ inline void run(GridT& grid, OpData& data, const VolumeExecutable& E)
     const size_t g1 = E.getGrainSize();
     const size_t g2 = E.getActiveTileStreamingGrainSize();
     const bool threadOtherOps = g1 > 0 || g2 > 0;
-    assert(data.mTreeLevelMin <= data.mTreeLevelMax);
+    OPENVDB_ASSERT(data.mTreeLevelMin <= data.mTreeLevelMax);
 
     // Cache any existing leaf node pointers before doing any execution
     std::unique_ptr> leafManager;
@@ -1183,7 +1184,7 @@ inline void run(GridCache& cache,
     const VolumeExecutable& E,
     Logger& logger)
 {
-    assert(cache.mRead.size() == registry.data().size());
+    OPENVDB_ASSERT(cache.mRead.size() == registry.data().size());
 
     // Initialize the shared op data
@@ -1212,7 +1213,7 @@ inline void run(GridCache& cache,
     openvdb::GridBase** read = cache.mRead.data();
     data.mVoidTransforms.reserve(cache.mRead.size());
     for (size_t i = 0; i < registry.data().size(); ++i, ++read) {
-        assert(read);
+        OPENVDB_ASSERT(read);
         data.mVoidTransforms.emplace_back(static_cast(&(*read)->transform()));
     }
@@ -1250,9 +1251,9 @@ VolumeExecutable::VolumeExecutable(const std::shared_ptr)
 {
-    assert(mContext);
-    assert(mExecutionEngine);
-    assert(mAttributeRegistry);
+    OPENVDB_ASSERT(mContext);
+    OPENVDB_ASSERT(mExecutionEngine);
+    OPENVDB_ASSERT(mAttributeRegistry);
 
     // Determine if this kernel needs automatic streaming
@@ -1322,7 +1323,7 @@ void VolumeExecutable::execute(openvdb::GridPtrVec& grids) const
         run(*cache, mFunctionAddresses, *mAttributeRegistry, mCustomData.get(), *mSettings, *this, *logger);
     }
     else {
-        assert(false && "Unrecognised voxel iterator.");
+        OPENVDB_ASSERT(false && "Unrecognised voxel iterator.");
     }
 }
@@ -1386,7 +1387,7 @@ VolumeExecutable::Streaming
 VolumeExecutable::getActiveTileStreaming(const std::string& name, const ast::tokens::CoreType& type) const
 {
-    assert(mAttributeRegistry);
+    OPENVDB_ASSERT(mAttributeRegistry);
     if (mSettings->mActiveTileStreaming.get() == VolumeExecutable::Streaming::AUTO) {
         const ax::AttributeRegistry::AccessData* accessData = mAttributeRegistry->get(name, type);
diff --git a/openvdb_ax/openvdb_ax/grammar/axlexer.l b/openvdb_ax/openvdb_ax/grammar/axlexer.l
index 4082fb443e..4583f270f4 100644
--- a/openvdb_ax/openvdb_ax/grammar/axlexer.l
+++ b/openvdb_ax/openvdb_ax/grammar/axlexer.l
@@ -20,6 +20,7 @@ SPDX-License-Identifier: MPL-2.0
     #include "openvdb_ax/compiler/Logger.h"
     #include "axparser.h" /*generated by bison*/
     #include 
+    #include <openvdb/util/Assert.h>
 
     #include 
     #include 
    #include 
@@ -35,7 +36,7 @@ SPDX-License-Identifier: MPL-2.0
    /// is performed. Instead of manually tracking newlines, we
    /// can simply scan for them in the current text held by axtext
    #define YY_USER_ACTION \
-        assert(axlog); \
+        OPENVDB_ASSERT(axlog); \
         axlloc.first_line = axlloc.last_line; \
         axlloc.first_column = axlloc.last_column; \
         for (int i = 0; axtext[i] != '\0'; i++) { \
@@ -347,7 +348,7 @@ COMMENT "//".*
 .   {
         /* error on everything else */
         /* @todo: move this into parser */
-        assert(axlog);
+        OPENVDB_ASSERT(axlog);
         axlog->error("stray or invalid character.",
             {axlloc.first_line, axlloc.first_column});
diff --git a/openvdb_ax/openvdb_ax/grammar/axparser.y b/openvdb_ax/openvdb_ax/grammar/axparser.y
index d2c8cc4fbf..db6bce3d9f 100644
--- a/openvdb_ax/openvdb_ax/grammar/axparser.y
+++ b/openvdb_ax/openvdb_ax/grammar/axparser.y
@@ -13,6 +13,7 @@
     #include "openvdb_ax/ast/Parse.h"
     #include "openvdb_ax/ast/Tokens.h"
     #include "openvdb_ax/compiler/Logger.h"
+    #include <openvdb/util/Assert.h>
 
     #include 
 
     extern int axlex();
@@ -80,7 +81,7 @@ template T* newNode(AXLTYPE* loc, const Args&... args)
 {
     T* ptr = new T(args...);
-    assert(axlog);
+    OPENVDB_ASSERT(axlog);
     axlog->addNodeLocation(ptr, {loc->first_line, loc->first_column});
     return ptr;
 }
@@ -289,14 +290,14 @@ declaration_list:
         free(const_cast($3));
     }
     | declaration_list COMMA IDENTIFIER EQUALS expression {
         const auto firstNode = $1->child(0);
-        assert(firstNode);
+        OPENVDB_ASSERT(firstNode);
         const tokens::CoreType type = static_cast(firstNode)->type();
         $$->addStatement(newNode(&@1, type, newNode(&@3, $3), $5));
         $$ = $1;
         free(const_cast($3));
     }
     | declaration_list COMMA IDENTIFIER {
         const auto firstNode = $1->child(0);
-        assert(firstNode);
+        OPENVDB_ASSERT(firstNode);
         const tokens::CoreType type = static_cast(firstNode)->type();
         $$->addStatement(newNode(&@1, type, newNode(&@3, $3)));
         free(const_cast($3));
diff --git a/openvdb_ax/openvdb_ax/grammar/generated/axlexer.cc b/openvdb_ax/openvdb_ax/grammar/generated/axlexer.cc
index 27445b0e6a..be0ee87572 100644
--- a/openvdb_ax/openvdb_ax/grammar/generated/axlexer.cc
+++ b/openvdb_ax/openvdb_ax/grammar/generated/axlexer.cc
@@ -1067,6 +1067,7 @@ SPDX-License-Identifier: MPL-2.0
     #include "openvdb_ax/compiler/Logger.h"
     #include "axparser.h" /*generated by bison*/
     #include 
+    #include <openvdb/util/Assert.h>
 
     #include 
     #include 
     #include 
@@ -1082,7 +1083,7 @@ SPDX-License-Identifier: MPL-2.0
    /// is performed. Instead of manually tracking newlines, we
    /// can simply scan for them in the current text held by axtext
    #define YY_USER_ACTION \
-        assert(axlog); \
+        OPENVDB_ASSERT(axlog); \
         axlloc.first_line = axlloc.last_line; \
         axlloc.first_column = axlloc.last_column; \
         for (int i = 0; axtext[i] != '\0'; i++) { \
@@ -2039,7 +2040,7 @@ YY_RULE_SETUP
 {
     /* error on everything else */
     /* @todo: move this into parser */
-    assert(axlog);
+    OPENVDB_ASSERT(axlog);
     axlog->error("stray or invalid character.",
         {axlloc.first_line, axlloc.first_column});
diff --git a/openvdb_ax/openvdb_ax/grammar/generated/axparser.cc b/openvdb_ax/openvdb_ax/grammar/generated/axparser.cc
index b17c55c686..b794ccd2dd 100644
--- a/openvdb_ax/openvdb_ax/grammar/generated/axparser.cc
+++ b/openvdb_ax/openvdb_ax/grammar/generated/axparser.cc
@@ -271,7 +271,7 @@ typedef enum yysymbol_kind_t yysymbol_kind_t;
 template T* newNode(AXLTYPE* loc, const Args&... args)
 {
     T* ptr = new T(args...);
-    assert(axlog);
+    OPENVDB_ASSERT(axlog);
     axlog->addNodeLocation(ptr, {loc->first_line, loc->first_column});
     return ptr;
 }
@@ -2273,7 +2273,7 @@ yyparse (openvdb::ax::ast::Tree** tree)
   case 37: /* declaration_list: declaration_list COMMA IDENTIFIER EQUALS expression */
     {
         const auto firstNode = (yyvsp[-4].statementlist)->child(0);
-        assert(firstNode);
+        OPENVDB_ASSERT(firstNode);
         const tokens::CoreType type = static_cast(firstNode)->type();
         (yyval.statementlist)->addStatement(newNode(&(yylsp[-4]), type, newNode(&(yylsp[-2]), (yyvsp[-2].string)), (yyvsp[0].expression)));
         (yyval.statementlist) = (yyvsp[-4].statementlist);
@@ -2283,7 +2283,7 @@ yyparse (openvdb::ax::ast::Tree** tree)
   case 38: /* declaration_list: declaration_list COMMA IDENTIFIER */
     {
         const auto firstNode = (yyvsp[-2].statementlist)->child(0);
-        assert(firstNode);
+        OPENVDB_ASSERT(firstNode);
         const tokens::CoreType type = static_cast(firstNode)->type();
         (yyval.statementlist)->addStatement(newNode(&(yylsp[-2]), type, newNode(&(yylsp[0]), (yyvsp[0].string))));
         free(const_cast((yyvsp[0].string)));
diff --git a/openvdb_ax/openvdb_ax/test/CMakeLists.txt b/openvdb_ax/openvdb_ax/test/CMakeLists.txt
index e403f716e0..3a72871cfe 100644
--- a/openvdb_ax/openvdb_ax/test/CMakeLists.txt
+++ b/openvdb_ax/openvdb_ax/test/CMakeLists.txt
@@ -39,7 +39,18 @@ else()
   set(OPENVDBAX_LIB openvdb_ax)
 endif()
 
+set(OPENVDB_AX_TEST_DEPENDENT_LIBS ${OPENVDBAX_LIB})
+
+if(CONCURRENT_MALLOC STREQUAL "Jemalloc")
+  find_package(Jemalloc REQUIRED)
+  list(APPEND OPENVDB_AX_TEST_DEPENDENT_LIBS Jemalloc::jemalloc)
+elseif(CONCURRENT_MALLOC STREQUAL "Tbbmalloc")
+  find_package(TBB ${MINIMUM_TBB_VERSION} REQUIRED COMPONENTS tbbmalloc)
+  list(APPEND OPENVDB_AX_TEST_DEPENDENT_LIBS TBB::tbbmalloc)
+endif()
+
 find_package(CppUnit ${MINIMUM_CPPUNIT_VERSION} REQUIRED)
+list(APPEND OPENVDB_AX_TEST_DEPENDENT_LIBS CppUnit::cppunit)
 
 set(TEST_SOURCE_FILES
   ast/TestScanners.cc
@@ -99,9 +110,7 @@ set(TEST_SOURCE_FILES
   main.cc
 )
 
-add_executable(vdb_ax_test
-  ${TEST_SOURCE_FILES}
-)
+add_executable(vdb_ax_test ${TEST_SOURCE_FILES})
 
 # @note From Clang 14, fp-contract has been switched to ON by default. Docs in
 # Clang versions before 14 state that fp-contract is ON, but it is in fact off.
@@ -119,13 +128,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App
     COMPILE_OPTIONS "-ffp-contract=off")
 endif()
 
-target_link_libraries(vdb_ax_test
-  ${OPENVDBAX_LIB}
-  CppUnit::cppunit
-)
-target_include_directories(vdb_ax_test
-  PRIVATE ../ .
-)
+target_link_libraries(vdb_ax_test ${OPENVDB_AX_TEST_DEPENDENT_LIBS})
+target_include_directories(vdb_ax_test PRIVATE ../ .)
 
 if(OPENVDB_AX_TEST_PROFILE)
   target_compile_definitions(vdb_ax_test PRIVATE "-DPROFILE")
@@ -133,14 +137,16 @@ endif()
 
 add_test(NAME vdb_ax_unit_test COMMAND vdb_ax_test -v WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../)
 
-# For the undefined behaviour sanitizer, add the suppression file and
-# additional options
-
+# For the sanitizers, add the suppression files and additional options
 get_filename_component(PATH_TO_PROJECT_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
 get_filename_component(PATH_TO_PROJECT_ROOT ${PATH_TO_PROJECT_ROOT} DIRECTORY)
 get_filename_component(PATH_TO_PROJECT_ROOT ${PATH_TO_PROJECT_ROOT} DIRECTORY)
+set(LSAN_SUPRESSION_FILE ${PATH_TO_PROJECT_ROOT}/cmake/scripts/lsan.supp)
 set(UBSAN_SUPRESSION_FILE ${PATH_TO_PROJECT_ROOT}/cmake/scripts/ubsan.supp)
 
+set(UBSAN_OPTS "$<$:UBSAN_OPTIONS=halt_on_error=1 report_error_type=1 suppressions=${UBSAN_SUPRESSION_FILE}>")
+set(LSAN_OPTS "$<$:LSAN_OPTIONS=suppressions=${LSAN_SUPRESSION_FILE}>")
+set(ASAN_OPTS "$<$:LSAN_OPTIONS=suppressions=${LSAN_SUPRESSION_FILE}>")
+
 set_tests_properties(vdb_ax_unit_test PROPERTIES
-  ENVIRONMENT
-  "$<$:UBSAN_OPTIONS=halt_on_error=1 report_error_type=1 suppressions=${UBSAN_SUPRESSION_FILE}>")
+  ENVIRONMENT "$")
         if (cmask1.isOn(n) && cmask2.isOn(n)) continue;
-        assert(vmask1.isOn(n) && vmask2.isOn(n));
+        OPENVDB_ASSERT(vmask1.isOn(n) && vmask2.isOn(n));
         if (settings.mCheckBufferValues &&
             !openvdb::math::isApproxEqual(t1[n].getValue(), t2[n].getValue(), tolerance)) {
@@ -483,7 +485,7 @@ struct CompareNodes
                 data->mValid = false;
             }
             else {
-                assert(n1 && n2);
+                OPENVDB_ASSERT(n1 && n2);
                 const typename MaskNodeT::NodeMaskType mask(mUseVoxelMask ? node.getValueMask() : true);
                 if (compareNodes(*n1, *n2, mask, *data, mSettings, mTolerance) &&
@@ -607,7 +609,7 @@ bool compareGrids(ComparisonResult& resultData,
         os << "[Diagnostic]: Leaf Node Diagnostics:\n" << std::endl;
 
     for (const auto& diag : data) {
-        assert(diag);
+        OPENVDB_ASSERT(diag);
         diag->report(os, firstGrid, secondGrid, accessorTopology, accessorValues);
     }
diff --git a/openvdb_cmd/vdb_ax/cli.h b/openvdb_cmd/vdb_ax/cli.h
index 555c4f132e..1b80e5bfa0 100644
--- a/openvdb_cmd/vdb_ax/cli.h
+++ b/openvdb_cmd/vdb_ax/cli.h
@@ -14,8 +14,8 @@
 #include
 #include
+#include <openvdb/util/Assert.h>
 
-#include <cassert>
 #include
 #include
 #include
@@ -153,7 +153,7 @@ inline void usage(std::ostream& os,
         oswrap(os, doc, doclen, maxWidth, [&](size_t) { return indent; });
     }
     else {
-        assert(whitespace >= argGap);
+        OPENVDB_ASSERT(whitespace >= argGap);
         // space between name and docs
         for (int32_t i = 0; i < whitespace; ++i) os << ' ';
@@ -167,7 +167,7 @@ inline void usage(std::ostream& os,
             // skip space break (if found)
             if (*doc == ' ') { ++doc; --doclen; }
             os << '\n';
-            assert(doclen >= remain);
+            OPENVDB_ASSERT(doclen >= remain);
             doclen -= remain;
             oswrap(os, doc, doclen, maxWidth, [&](size_t) { return indent; });
         }
@@ -239,13 +239,13 @@ struct Param : public BasicParam<T>, ParamBase
     inline bool isInit() const override { return mInit; }
     inline void init(const char* arg, const uint32_t idx = 0) override
     {
-        assert((!arg && mCb1) || (arg && mCb2) || (arg && mCb3));
+        OPENVDB_ASSERT((!arg && mCb1) || (arg && mCb2) || (arg && mCb3));
         if (!arg) mCb1(BasicParam<T>::mValue);
         else if (mCb3 && this->acceptsIndex()) {
             mCb3(BasicParam<T>::mValue, arg, idx);
         }
         else {
-            assert(mCb2);
+            OPENVDB_ASSERT(mCb2);
             mCb2(BasicParam<T>::mValue, arg);
         }
         mInit = true;
@@ -329,8 +329,8 @@ struct ParamBuilder
         mParam.ParamBase::mOpts.clear();
     }
     ParamBuilder& addOpt(const char* opt) {
-        assert(opt);
-        assert(opt[0] == '-' || std::strchr(opt, ' ') == nullptr);
+        OPENVDB_ASSERT(opt);
+        OPENVDB_ASSERT(opt[0] == '-' || std::strchr(opt, ' ') == nullptr);
         mParam.ParamBase::mOpts.emplace_back(opt);
         return *this;
     }
@@ -340,7 +340,7 @@ struct ParamBuilder
     ParamBuilder& setCB(const typename ParamT::CB2 cb) { mParam.mCb2 = cb; return *this; }
     ParamBuilder& setCB(const typename ParamT::CB3 cb) { mParam.mCb3 = cb; return *this; }
     ParamT&& get() {
-        assert(!mParam.ParamBase::mOpts.empty());
+        OPENVDB_ASSERT(!mParam.ParamBase::mOpts.empty());
         if (!(mParam.mCb1 || mParam.mCb2 || mParam.mCb3)) {
             this->setCB(DefaultCallback::get());
         }
diff --git a/openvdb_cmd/vdb_ax/main.cc b/openvdb_cmd/vdb_ax/main.cc
index 0dd402748e..f96b749486 100644
--- a/openvdb_cmd/vdb_ax/main.cc
+++ b/openvdb_cmd/vdb_ax/main.cc
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <openvdb/util/Assert.h>
 
 #include
 #include
@@ -248,7 +249,7 @@ struct ProgOptions
             if (v) OPENVDB_LOG_WARN("multiple code snippets provided, only using last input.");
             if (idx == 0) v.reset(new std::string(arg));
             else {
-                assert(idx == 1);
+                OPENVDB_ASSERT(idx == 1);
                 v.reset(new std::string());
                 loadSnippetFile(arg, *v);
             }
@@ -827,7 +828,7 @@ main(int argc, char *argv[])
     if (printing) {
         axlog("Querying available functions\n" << std::flush);
-        assert(opts.mFunctionNamesOnly.get() || initializer.isInitialized());
+        OPENVDB_ASSERT(opts.mFunctionNamesOnly.get() || initializer.isInitialized());
         printFunctions(opts.mFunctionNamesOnly.get(),
             opts.mFunctionList.get().second,
             std::cout);
@@ -883,7 +884,7 @@ main(int argc, char *argv[])
         }
     }
 
-    assert(initCompile);
+    OPENVDB_ASSERT(initCompile);
 
     std::ostringstream tmp;
     openvdb::ax::cli::ParamToStream(tmp, opts.mOptLevel.get());
diff --git a/openvdb_cmd/vdb_tool/include/Geometry.h b/openvdb_cmd/vdb_tool/include/Geometry.h
index 476e90f83a..07c7f44743 100644
--- a/openvdb_cmd/vdb_tool/include/Geometry.h
+++ b/openvdb_cmd/vdb_tool/include/Geometry.h
@@ -29,6 +29,7 @@
 #include
 #include
+#include <openvdb/util/Assert.h>
 
 #ifdef VDB_TOOL_USE_NANO
 #include
@@ -709,7 +710,7 @@ void Geometry::readVDB(const std::string &fileName)
     for (auto m : *meta) {
         if (m->isType<points::PointDataGrid>()) {
             auto grid = gridPtrCast<points::PointDataGrid>(file.readGrid(m->getName()));
-            assert(grid);
+            OPENVDB_ASSERT(grid);
             size_t n = mVtx.size();
             const auto m = points::pointCount(grid->tree());
             mVtx.resize(n + m);
@@ -788,7 +789,7 @@ void Geometry::readSTL(const std::string &fileName)
         while(std::getline(infile, line)) {// loop over vertices of the facet
             tmp = trim(line, " ");
             if (tmp.compare(0, 7, "endloop")==0) break;
-            assert(tmp.compare(0, 6, "vertex")==0);
+            OPENVDB_ASSERT(tmp.compare(0, 6, "vertex")==0);
             iss.clear();
             iss.str(tmp.substr(6));
             if (iss >> xyz[0] >> xyz[1] >> xyz[2]) {
diff --git a/openvdb_cmd/vdb_tool/include/Parser.h b/openvdb_cmd/vdb_tool/include/Parser.h
index 8d3e6bca24..80245e5db3 100644
--- a/openvdb_cmd/vdb_tool/include/Parser.h
+++ b/openvdb_cmd/vdb_tool/include/Parser.h
@@ -32,6 +32,7 @@
 #include
 #include
+#include <openvdb/util/Assert.h>
 
 #include "Util.h"
 
@@ -805,7 +806,7 @@ Parser::Parser(std::vector