From a812cd482f777173d338127b62ecca9743726acf Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:26:37 -0400 Subject: [PATCH 1/4] Build Docker images for multiple CUDA versions (#859) * para docker build cuda * run.sh version specify * running right cuda version * env argument added * simplify docker version * code simplify and autodetect * bug fix * bug fix * autodetection * bug fix * fixes * fix * publish.sh fixes, update workflow * restore support for mt5 * update scripts and README --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/docker-build.yml | 35 ++++++++--- docker/README.md | 26 ++++---- docker/build.sh | 98 ++++++++++++------------------ docker/flexflow/Dockerfile | 4 +- docker/publish.sh | 88 ++++++++++++--------------- docker/pull.sh | 50 +++++++++++++-- docker/run.sh | 70 +++++++++++++++------ 7 files changed, 216 insertions(+), 155 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 477a47c7dc..40c86c1600 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -25,6 +25,21 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] + cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6","11.7", "11.8"] + exclude: + - gpu_backend: "hip_rocm" + cuda_version: "11.2" + - gpu_backend: "hip_rocm" + cuda_version: "11.3" + - gpu_backend: "hip_rocm" + cuda_version: "11.5" + - gpu_backend: "hip_rocm" + cuda_version: "11.6" + - gpu_backend: "hip_rocm" + cuda_version: "11.7" + - gpu_backend: "hip_rocm" + cuda_version: "11.8" + fail-fast: false steps: - name: Checkout Git Repository @@ -38,30 +53,34 @@ jobs: - name: Build Docker container env: FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + cuda_version: ${{ matrix.cuda_version }} run: | - # On push to master, build for all compatible architectures, so that we can publish + # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. 
- if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then + if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then export FF_CUDA_ARCH=all else export FF_CUDA_ARCH=70 fi - ./docker/build.sh --image_name flexflow --cuda_version 11.8 + ./docker/build.sh flexflow - name: Check availability of Python flexflow.core module if: ${{ matrix.gpu_backend == 'cuda' }} - run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-11.8.0:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + env: + cuda_version: ${{ matrix.cuda_version }} + run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" - - name: Publish Docker environment image (on push to master) + - name: Publish Docker environment image (on push to inference) if: github.repository_owner == 'flexflow' env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + cuda_version: ${{ matrix.cuda_version }} run: | - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then - ./docker/publish.sh --image_name "flexflow-environment-${FF_GPU_BACKEND}" --cuda_version 11.8 - ./docker/publish.sh --image_name "flexflow-${FF_GPU_BACKEND}" --cuda_version 11.8 + if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then + ./docker/publish.sh flexflow-environment + ./docker/publish.sh flexflow else echo "No need to update Docker containers in ghrc.io registry at this time." fi diff --git a/docker/README.md b/docker/README.md index 8be6095c69..4c52a66b6a 100644 --- a/docker/README.md +++ b/docker/README.md @@ -5,48 +5,48 @@ This folder contains the Dockerfiles and scripts that you can use to quickly run You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine. ## Downloading a pre-built package -The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `master` branch. The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow): +The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow): -* [flexflow-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda): the pre-built version of FlexFlow targeting GPUs with a CUDA backend. N.B.: currently, this container is only fully compatible with host machines that have CUDA 11.7 installed. 
+* [flexflow-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda): the pre-built version of FlexFlow targeting GPUs with a CUDA backend, for CUDA versions 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8. * [flexflow-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-hip_rocm): the pre-built version of FlexFlow targeting GPUs with a HIP-ROCM backend. -* [flexflow-environment-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda) and [flexflow-environment-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-hip_rocm): these are the base layers for `flexflow-cuda` and `flexflow-hip_rocm`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. N.B.: currently, the `flexflow-environment-cuda` container is only fully compatible with host machines that have CUDA 11.7 installed. +* [flexflow-environment-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda) and [flexflow-environment-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-hip_rocm): these are the base layers for `flexflow-cuda` and `flexflow-hip_rocm`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run FlexFlow. The easiest way to download any of the Docker containers above is to call: ``` -./docker/pull.sh +FF_GPU_BACKEND=<your backend> cuda_version=<cuda version> ./docker/pull.sh <CONTAINER_NAME> ``` +where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND`/`cuda_version` are optional environment variables you can use if you wish to download the Docker image for a GPU backend and/or CUDA version other than the ones installed on your machine (leaving these variables unset will let the script autodetect which version to download depending on your setup). + After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below. ## Building a Docker container from scratch -If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). +If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=11.8`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA.
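For reference, the `build.sh`, `publish.sh`, `pull.sh`, and `run.sh` scripts all autodetect the host CUDA version the same way, by parsing the output of `nvcc`; a quick way to check which version they will pick up (assuming `nvcc` is on your `PATH`) is:
```
nvcc --version | grep "release" | awk '{print $NF}'   # prints e.g. V11.7.99, which the scripts shorten to 11.7
```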
To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted): ``` -FF_GPU_BACKEND= ./docker/build.sh flexflow +FF_GPU_BACKEND=<your backend> cuda_version=<cuda version> ./docker/build.sh flexflow ``` If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run: ``` -FF_GPU_BACKEND= ./docker/build.sh flexflow-environment +FF_GPU_BACKEND=<your backend> cuda_version=<cuda version> ./docker/build.sh flexflow-environment ``` ## Running a Docker container -After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the `flexflow` argument of the run script can be omitted): +After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND` and `cuda_version` optional environment variables to run the Docker image with the desired GPU backend and CUDA version. Leaving these variables unset will instruct the script to autodetect the GPU backend and CUDA version installed on the current machine and run the Docker container with it if available. ``` -FF_GPU_BACKEND= ./docker/run.sh flexflow +FF_GPU_BACKEND=<your backend> cuda_version=<cuda version> ./docker/run.sh flexflow ``` If you wish to run the `flexflow-environment` container, run: ``` -FF_GPU_BACKEND= ./docker/run.sh flexflow-environment +FF_GPU_BACKEND=<your backend> cuda_version=<cuda version> ./docker/run.sh flexflow-environment ``` -Once again, if your backend is CUDA, you can omit the `FF_GPU_BACKEND` environment variable, since `cuda` is used as the default value. - -N.B.: If you don't have GPUs available on the machine, edit the `run.sh` script and set `ATTACH_GPUS=false` before running it. +N.B.: If you don't have GPUs available on the machine, or you wish to run the Docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script. diff --git a/docker/build.sh b/docker/build.sh index a5505335b4..6ed5cbe00e 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -2,79 +2,61 @@ set -euo pipefail # Usage: ./build.sh +# Optional environment variables: FF_GPU_BACKEND, cuda_version -# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # Cd into $FF_HOME. Assumes this script is in $FF_HOME/docker -cd "$SCRIPT_DIR/.." - -cuda_version="empty" -image="flexflow" - -# Parse command-line options and # Get name of desired Docker image and cuda version as input -while [[ $# -gt 0 ]]; do - key="$1" - - case $key in - --cuda_version) - cuda_version="$2" - shift 2 - ;; - --image_name) - image="$2" - shift 2 - ;; - *) - echo "Invalid option: $key" - exit 1 - ;; - esac -done - -if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') - # Change cuda_version eg.
V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} -fi - - -if [[ "$cuda_version" != @(11.1|11.3|11.5|11.6|11.7|11.8) ]]; then - # validate the verison of CUDA against a list of supported ones - # 11.1, 11.3, 11.5, 11.6, 11.7, 11.8 - # Available versions: 11.1.1 | 11.2.2 | 11.3.1 | 11.5.2 | 11.6.2 | 11.7.1 | 11.8.0 - echo "cuda_version is not supported, please choose among {11.1,11.3,11.5,11.6,11.7,11.8}" - exit 1 -fi - -# modify cuda version to available versions -if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then - cuda_version=${cuda_version}.1 -elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then - cuda_version=${cuda_version}.2 -elif [[ "$cuda_version" == @(11.8) ]]; then - cuda_version=${cuda_version}.0 -fi - +cd "${BASH_SOURCE[0]%/*}/.." +# Parse input params +image=${1:-flexflow} +FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +cuda_version=${cuda_version:-"empty"} +# Check docker image name if [[ "$image" != @(flexflow-environment|flexflow) ]]; then echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment' and 'flexflow'." exit 1 fi -# Set up GPU backend -FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +# Check GPU backend if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." exit 1 elif [[ "${FF_GPU_BACKEND}" != "cuda" ]]; then - echo "Configuring FlexFlow to build for gpu backend: ${FF_GPU_BACKEND}" + echo "Building $image docker image with gpu backend: ${FF_GPU_BACKEND}" +else + echo "Building $image docker image with default GPU backend: cuda" +fi + +if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect cuda version if not specified + if [[ $cuda_version == "empty" ]]; then + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # Change cuda_version eg. V11.7.99 to 11.7 + cuda_version=${cuda_version:1:4} + fi + # Check that CUDA version is supported, and modify cuda version to include default subsubversion + if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then + cuda_version_input=${cuda_version}.1 + elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then + cuda_version_input=${cuda_version}.2 + elif [[ "$cuda_version" == @(11.8) ]]; then + cuda_version_input=${cuda_version}.0 + else + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + exit 1 + fi + # Set cuda version suffix to docker image name + echo "Building $image docker image with CUDA $cuda_version" + cuda_version="-${cuda_version}" else - echo "Letting FlexFlow build for a default GPU backend: cuda" + # Empty cuda version suffix for non-CUDA images + cuda_version="" + # Pick a default CUDA version for the base docker image from NVIDIA + cuda_version_input="11.8.0" fi -# Build the FlexFlow Enviroment docker image with input cuda version -docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-environment-${FF_GPU_BACKEND}-${cuda_version}" -f docker/flexflow-environment/Dockerfile . +docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "cuda_version=${cuda_version_input}" -t "flexflow-environment-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow-environment/Dockerfile . 
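# For reference, the image tag built by the command above follows the pattern flexflow-environment-${FF_GPU_BACKEND}${cuda_version};
# a sketch of the resulting local tags, assuming CUDA 11.8 is the selected version:
#   flexflow-environment-cuda-11.8:latest    (FF_GPU_BACKEND=cuda, cuda_version suffix "-11.8")
#   flexflow-environment-hip_rocm:latest     (non-CUDA backends leave the cuda_version suffix empty)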
# If the user only wants to build the environment image, we are done if [[ "$image" == "flexflow-environment" ]]; then @@ -132,4 +114,4 @@ fi # Set value of BUILD_CONFIGS get_build_configs -docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-${FF_GPU_BACKEND}-${cuda_version}" -f docker/flexflow/Dockerfile . \ No newline at end of file +docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow/Dockerfile . diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 1538625489..0cda5cbc18 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -1,6 +1,6 @@ ARG FF_GPU_BACKEND "cuda" -ARG cuda_version -FROM flexflow-environment-$FF_GPU_BACKEND-$cuda_version:latest +ARG cuda_version "" +FROM flexflow-environment-$FF_GPU_BACKEND$cuda_version:latest LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow container" diff --git a/docker/publish.sh b/docker/publish.sh index f1dbbfcb92..b8668d3c0e 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -1,62 +1,55 @@ #! /usr/bin/env bash set -euo pipefail +# Usage: ./publish.sh +# Optional environment variables: FF_GPU_BACKEND, cuda_version + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -cuda_version="empty" -image="flexflow-cuda" - -# Parse command-line options -while [[ $# -gt 0 ]]; do - key="$1" - - case $key in - --cuda_version) - cuda_version="$2" - shift 2 - ;; - --image_name) - image="$2" - shift 2 - ;; - *) - echo "Invalid option: $key" - exit 1 - ;; - esac -done +# Parse input params +image=${1:-flexflow} +FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +cuda_version=${cuda_version:-"empty"} -if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} +# Check docker image name +if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then + echo "Error, docker image name '${image}' is invalid. Choose between 'flexflow-environment' and 'flexflow'." + exit 1 fi -if [[ "$cuda_version" != @(11.1|11.3|11.5|11.6|11.7|11.8) ]]; then - # validate the verison of CUDA against a list of supported ones - # 11.1, 11.3, 11.5, 11.6, 11.7, 11.8 - echo "cuda_version is not supported, please choose among {11.1,11.3,11.5,11.6,11.7,11.8}" +# Check GPU backend +if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then + echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." 
exit 1 +elif [[ "${FF_GPU_BACKEND}" != "cuda" ]]; then + echo "Publishing $image docker image with gpu backend: ${FF_GPU_BACKEND}" +else + echo "Publishing $image docker image with default GPU backend: cuda" fi -# modify cuda version to available versions -if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then - cuda_version=${cuda_version}.1 -elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then - cuda_version=${cuda_version}.2 -elif [[ "$cuda_version" == @(11.8) ]]; then - cuda_version=${cuda_version}.0 -fi - - -if [[ "${image}" != @(flexflow-environment-cuda|flexflow-environment-hip_cuda|flexflow-environment-hip_rocm|flexflow-environment-intel|flexflow-cuda|flexflow-hip_cuda|flexflow-hip_rocm|flexflow-intel) ]]; then - echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment-{cuda,hip_cuda,hip_rocm,intel}' and 'flexflow-{cuda,hip_cuda,hip_rocm,intel}'." - exit 1 +if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect cuda version if not specified + if [[ $cuda_version == "empty" ]]; then + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # Change cuda_version eg. V11.7.99 to 11.7 + cuda_version=${cuda_version:1:4} + fi + # Check that CUDA version is supported + if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + exit 1 + fi + # Set cuda version suffix to docker image name + echo "Publishing $image docker image with CUDA $cuda_version" + cuda_version="-${cuda_version}" +else + # Empty cuda version suffix for non-CUDA images + cuda_version="" fi # Check that image exists -docker image inspect "${image}-${cuda_version}":latest > /dev/null +docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null # Log into container registry FLEXFLOW_CONTAINER_TOKEN=${FLEXFLOW_CONTAINER_TOKEN:-} @@ -66,9 +59,8 @@ echo "$FLEXFLOW_CONTAINER_TOKEN" | docker login ghcr.io -u flexflow --password-s # Tag image to be uploaded git_sha=${GITHUB_SHA:-$(git rev-parse HEAD)} if [ -z "$git_sha" ]; then echo "Commit hash cannot be detected, cannot publish the docker image to ghrc.io"; exit; fi - -docker tag "${image}-${cuda_version}":latest ghcr.io/flexflow/"$image-$cuda_version":latest - +docker tag "${image}-${FF_GPU_BACKEND}${cuda_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest # Upload image -docker push ghcr.io/flexflow/"$image-$cuda_version":latest +docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest + diff --git a/docker/pull.sh b/docker/pull.sh index aa7078965e..f8624a1072 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -1,20 +1,58 @@ #! /usr/bin/env bash set -euo pipefail +# Usage: ./pull.sh +# Optional environment variables: FF_GPU_BACKEND, cuda_version + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -image=${1:-"flexflow-cuda"} -if [[ "${image}" != @(flexflow-environment-cuda|flexflow-environment-hip_cuda|flexflow-environment-hip_rocm|flexflow-environment-intel|flexflow-cuda|flexflow-hip_cuda|flexflow-hip_rocm|flexflow-intel) ]]; then - echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment-{cuda,hip_cuda,hip_rocm,intel}' and 'flexflow-{cuda,hip_cuda,hip_rocm,intel}'." 
+# Parse input params +image=${1:-flexflow} +FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +cuda_version=${cuda_version:-"empty"} + +# Check docker image name +if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then + echo "Error, docker image name '${image}' is invalid. Choose between 'flexflow-environment' and 'flexflow'." exit 1 fi +# Check GPU backend +if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then + echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." + exit 1 +elif [[ "${FF_GPU_BACKEND}" != "cuda" ]]; then + echo "Downloading $image docker image with gpu backend: ${FF_GPU_BACKEND}" +else + echo "Downloading $image docker image with default GPU backend: cuda" +fi + +if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect cuda version if not specified + if [[ $cuda_version == "empty" ]]; then + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # Change cuda_version eg. V11.7.99 to 11.7 + cuda_version=${cuda_version:1:4} + fi + # Check that CUDA version is supported + if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + exit 1 + fi + # Set cuda version suffix to docker image name + echo "Downloading $image docker image with CUDA $cuda_version" + cuda_version="-${cuda_version}" +else + # Empty cuda version suffix for non-CUDA images + cuda_version="" +fi + # Download image -docker pull ghcr.io/flexflow/"$image" +docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}" # Tag downloaded image -docker tag ghcr.io/flexflow/"$image":latest "$image":latest +docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}":latest "$image-${FF_GPU_BACKEND}${cuda_version}":latest # Check that image exists -docker image inspect "${image}":latest > /dev/null +docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null diff --git a/docker/run.sh b/docker/run.sh index e04e7d68c1..fd7550afc8 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -1,42 +1,72 @@ #! /usr/bin/env bash set -euo pipefail +# Usage: ./run.sh +# Optional environment variables: FF_GPU_BACKEND, cuda_version, ATTACH_GPUS, SHM_SIZE + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +# Parse input params +image=${1:-flexflow} +FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +cuda_version=${cuda_version:-"empty"} + # Parameter controlling whether to attach GPUs to the Docker container -ATTACH_GPUS=true +ATTACH_GPUS=${ATTACH_GPUS:-true} +gpu_arg="" +if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. If you don't have enough memory # on your machine, decrease this value. -SHM_SIZE=8192m +SHM_SIZE=${SHM_SIZE:-8192m} -gpu_arg="" -if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi -image=${1:-flexflow} +# Check docker image name +if [[ "$image" != @(flexflow-environment|flexflow) ]]; then + echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment', 'flexflow'." + exit 1 +fi -FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} +# Check GPU backend if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." 
exit 1 elif [[ "${FF_GPU_BACKEND}" != "cuda" ]]; then - echo "Running FlexFlow with GPU backend: ${FF_GPU_BACKEND}" + echo "Running $image docker image with gpu backend: ${FF_GPU_BACKEND}" else - echo "Running FlexFlow with default GPU backend: cuda" + echo "Running $image docker image with default GPU backend: cuda" fi - -if [[ "$image" == "flexflow-environment" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-environment-${FF_GPU_BACKEND}:latest" -elif [[ "$image" == "flexflow" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-${FF_GPU_BACKEND}:latest" -elif [[ "$image" == "mt5" ]]; then - # Backward compatibility - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" \ - -v "$(pwd)"/../examples/python/pytorch/mt5/data:/usr/FlexFlow/examples/python/pytorch/mt5/data \ - -v "$(pwd)"/../examples/python/pytorch/mt5/eng-sin.tar:/usr/FlexFlow/examples/python/pytorch/mt5/eng-sin.tar \ - "flexflow-${FF_GPU_BACKEND}:latest" +if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect cuda version if not specified + if [[ $cuda_version == "empty" ]]; then + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # Change cuda_version e.g. V11.7.99 to 11.7 + cuda_version=${cuda_version:1:4} + fi + # Check that CUDA version is supported + if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + exit 1 + fi + # Set cuda version suffix to docker image name + echo "Running $image docker image with CUDA $cuda_version" + cuda_version_hyphen="-${cuda_version}" else - echo "Docker image name not valid" + # Empty cuda version suffix for non-CUDA images + cuda_version_hyphen="" fi + +# Check that the image exists; if it does not, print how to obtain it and exit +if ! docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}":latest > /dev/null 2>&1 ; then + echo "" + echo "To download the docker image, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image" + echo "To build the docker image from source, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image" + echo "" + exit 1 +fi + +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest" From d3cd3709a35dc939a60dc6e153cb9ccb2c3ef4f3 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 18 Jul 2023 21:02:13 -0400 Subject: [PATCH 2/4] Inference: Sampling result (#854) * init * sort * . * del * . * finish impl. * clean up, format, hip_rocm * format * . * fix half precision. * try torch1. * .
* batch size * fix * rename GenerationConfig SamplingConfig --------- Co-authored-by: Zhihao Jia --- include/flexflow/ffconst.h | 1 + include/flexflow/inference.h | 12 + include/flexflow/model.h | 6 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/sampling.h | 108 +++++++ include/flexflow/ops/sampling_params.h | 24 ++ inference/incr_decoding/incr_decoding.cc | 25 ++ inference/models/llama.cc | 10 +- inference/models/llama.h | 1 + inference/spec_infer/spec_infer.cc | 3 + src/ops/fused.cu | 3 +- src/ops/sampling.cc | 343 +++++++++++++++++++++++ src/ops/sampling.cpp | 67 +++++ src/ops/sampling.cu | 267 ++++++++++++++++++ src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 5 + src/runtime/model.cc | 40 ++- src/runtime/operator_params.cc | 3 + 18 files changed, 919 insertions(+), 3 deletions(-) create mode 100644 include/flexflow/ops/sampling.h create mode 100644 include/flexflow/ops/sampling_params.h create mode 100644 src/ops/sampling.cc create mode 100644 src/ops/sampling.cpp create mode 100644 src/ops/sampling.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 3d899ac91d..65fa23569b 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -167,6 +167,7 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_INC_MULTIQUERY_SELF_ATTENTION, + OP_SAMPLING, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index a1846c96dc..0c5274e15b 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -65,6 +65,18 @@ struct BeamTree { treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; +struct SamplingConfig { + bool do_sample = false; + float temperature = 0.8; + float topp = 0.6; + SamplingConfig(bool _do_sample, float _temperature, float _topp) { + temperature = _temperature > 0 ? _temperature : temperature; + topp = _topp > 0 ? 
_topp : topp; + do_sample = _do_sample; + } + SamplingConfig() {} +}; + // struct BeamTree_v2 { // std::vector tokens; // std::vector parent_ids; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 38c1cec838..3a76209b98 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -136,6 +136,8 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + SAMPLING_INIT_TASK_ID, + SAMPLING_INF_TASK_ID, TRANSPOSE_INIT_TASK_ID, TRANSPOSE_FWD_TASK_ID, TRANSPOSE_BWD_TASK_ID, @@ -312,6 +314,7 @@ class RMSNorm; class BeamTopK; class SpecIncMultiHeadSelfAttention; class IncMultiQuerySelfAttention; +class Sampling; class Combine; class Repartition; class Reduction; @@ -612,6 +615,7 @@ class FFModel { int k, bool sorted, char const *name = NULL); + Tensor sampling(const Tensor input, float top_p, char const *name = NULL); Tensor multihead_attention(const Tensor query, const Tensor key, const Tensor value, @@ -1061,6 +1065,8 @@ class FFModel { IncMultiQuerySelfAttention *>, std::unordered_map, BeamTopK *>, + std::unordered_map, + Sampling *>, std::unordered_map< std::pair, SpecIncMultiHeadSelfAttention *>, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index f6918ff581..5c2101d190 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -26,6 +26,7 @@ #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" #include "flexflow/ops/rms_norm_params.h" +#include "flexflow/ops/sampling_params.h" #include "flexflow/ops/softmax_params.h" #include "flexflow/ops/spec_inc_multihead_self_attention_params.h" #include "flexflow/ops/split_params.h" @@ -71,6 +72,7 @@ using OperatorParameters = mp::variant +#include +#endif + +namespace FlexFlow { + +class SamplingMeta : public OpMeta { +public: + float top_p; + void *sorted_logits; + int *sorted_idx; + int *begin_offset; + int *end_offset; + int *idx; + void *d_temp_storage; + size_t temp_storage_bytes; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; +#endif + SamplingMeta(FFHandler handle, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input); +}; + +class Sampling : public Op { +public: + using Params = SamplingParams; + using Input = ParallelTensor; + Sampling(FFModel &model, + const ParallelTensor input, + float top_p, + char const *name); + Sampling(FFModel &model, Sampling const &other, const ParallelTensor input); + Sampling(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static InferenceResult + inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void serialize(Legion::Serializer &s) const override; + static PCG::Node 
deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void forward_kernel(SamplingMeta const *m, + DT *input_ptr, + int *indices_ptr, + float top_p, + int length, + int batch_size, + ffStream_t stream); + static void forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size); + Params get_params() const; + +public: + float top_p; +}; + +}; // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/include/flexflow/ops/sampling_params.h b/include/flexflow/ops/sampling_params.h new file mode 100644 index 0000000000..1449ddbf54 --- /dev/null +++ b/include/flexflow/ops/sampling_params.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_SAMPLING_PARAMS_H +#define _FLEXFLOW_SAMPLING_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SamplingParams { + float top_p; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(SamplingParams const &, SamplingParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::SamplingParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_SAMPLING_PARAMS_H \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 68a8e10042..17fc58c53a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -38,6 +38,9 @@ void parse_input_args(char **argv, ModelType &llm_model_type, bool &use_full_precision, bool &verbose, + bool &do_sample, + float &temperature, + float &topp, int &data_parallelism_degree, int &tensor_parallelism_degree, int &pipeline_parallelism_degree) { @@ -109,6 +112,18 @@ void parse_input_args(char **argv, verbose = true; continue; } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } } } @@ -124,6 +139,9 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type; bool use_full_precision = false; bool verbose = false; + bool do_sample = false; + float temperature = 0.0f; + float topp = 0.0f; size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; int data_parallelism_degree = 1, tensor_parallelism_degree = 1, pipeline_parallelism_degree = 1; @@ -137,12 +155,16 @@ void FlexFlow::top_level_task(Task const *task, model_type, use_full_precision, verbose, + do_sample, + temperature, + topp, data_parallelism_degree, tensor_parallelism_degree, pipeline_parallelism_degree); ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; + assert(data_parallelism_degree * tensor_parallelism_degree * pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -150,6 +172,7 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + SamplingConfig samplingConfig(do_sample, 
temperature, topp); InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_type, file_paths.tokenizer_file_path, @@ -163,6 +186,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, INC_DECODING_MODE, + samplingConfig, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, @@ -211,6 +235,7 @@ void FlexFlow::top_level_task(Task const *task, assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); ir = future.get_result(); + // assert(false); } // Execution fence diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54ec13147..06dfaebcb1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -24,6 +24,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, + SamplingConfig samplingConfig, bool use_full_precision) { // do not apply cpu offload in beam search model. Config llama_config(model_config_file_path); @@ -210,7 +211,14 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); } else { - output = ff.arg_top_k(dense, /*k=*/1, false); + // Tensor softmax = ff.softmax(dense, -1); + if (samplingConfig.do_sample) { + dense = ff.scalar_truediv(dense, samplingConfig.temperature, false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.sampling(softmax, samplingConfig.topp); + } else { + output = ff.arg_top_k(dense, /*k=*/1, false); + } } // Compile the model diff --git a/inference/models/llama.h b/inference/models/llama.h index ab9bd4c7f3..6f80194d72 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -107,6 +107,7 @@ class LLAMA { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, + SamplingConfig samplingConfig, bool use_full_precision = false); }; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9cdcb454a2..a4c3dc64f9 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -199,6 +199,7 @@ void FlexFlow::top_level_task(Task const *task, } // Create SentencePiece tokenizer or OPT tokenizer + SamplingConfig samplingConfig; InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_types.llm_model_type, file_paths.tokenizer_file_path, @@ -213,6 +214,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, TREE_VERIFY_MODE, + samplingConfig, use_full_precision); } else if (model_types.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, @@ -245,6 +247,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], BEAM_SEARCH_MODE, + samplingConfig, use_full_precision); } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 2f84100554..ef6c856871 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -748,7 +748,8 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); diff --git 
a/src/ops/sampling.cc b/src/ops/sampling.cc new file mode 100644 index 0000000000..8c01464042 --- /dev/null +++ b/src/ops/sampling.cc @@ -0,0 +1,343 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/sampling.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension). Thus, +// values.shape = indices.shape = input.shape[:-1] + [k] +Tensor FFModel::sampling(const Tensor input, float top_p, char const *name) { + Layer *li = new Layer(this, + OP_SAMPLING, + input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + // now just support 1 output + dims[0] = 1; + // li->outputs[0] = create_tensor_legion_ordering( + // numdims, dims, input->data_type, li, 0, true /*create_grad*/); + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + } + layers.push_back(li); + li->add_float_property("top_p", top_p); + // outputs[0] = li->outputs[0]; + // outputs[1] = li->outputs[1]; + return li->outputs[0]; +} + +Op *Sampling::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + float top_p; + layer->get_float_property("top_p", top_p); + return new Sampling(model, inputs[0], top_p, layer->name); +} + +SamplingParams Sampling::get_params() const { + SamplingParams params; + params.top_p = this->top_p; + return params; +} + +bool SamplingParams::is_valid(ParallelTensorShape const &) const { + return true; +} + +bool operator==(SamplingParams const &lhs, SamplingParams const &rhs) { + return lhs.top_p == rhs.top_p; +} + +Sampling::Sampling(FFModel &model, + const ParallelTensor _input, + float _top_p, + char const *name) + : Op(model, + OP_SAMPLING, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + _input), + top_p(_top_p) { + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + dims[0].size = 1; + 
std::cout << "degree: " << inputs[0]->dims[0].degree << "\n"; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + // outputs[0] = model.create_parallel_tensor_legion_ordering( + // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); +} + +Sampling::Sampling(FFModel &model, + Sampling const &other, + const ParallelTensor input) + : Sampling(model, input, other.top_p, other.name) {} + +Sampling::Sampling(FFModel &model, + SamplingParams const ¶ms, + const ParallelTensor input, + char const *name) + : Sampling(model, input, params.top_p, name) {} + +void Sampling::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(SAMPLING_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Sampling)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void Sampling::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(SAMPLING_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Sampling)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *Sampling::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Sampling *s = (Sampling *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + GenericTensorAccessorW acc_input = + helperGetGenericTensorAccessorRW(s->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + + int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; + int batch_size = acc_input.domain.get_volume() / length; + + SamplingMeta *m = + new SamplingMeta(handle, s, batch_size, length * batch_size, acc_input); + m->profiling = 
s->profiling; + m->top_p = s->top_p; + return m; +} + +void Sampling::forward(FFModel const &ff) { + // Sampling does not support forward + assert(false); +} + +FutureMap Sampling::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Sampling op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SAMPLING_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +InferenceResult + Sampling::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = (BatchConfig *)task->args; + SamplingMeta const *m = *((SamplingMeta **)task->local_args); + + GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int batch_size = bc->num_active_tokens(); + Sampling::forward_kernel_wrapper(m, input, indices, batch_size); + + InferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; +} + +void Sampling::backward(FFModel const &ff) { + // Sampling does not support backward + assert(false); +} + +void Sampling::serialize(Legion::Serializer &sez) const { + sez.serialize(this->top_p); +} + +Node Sampling::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + float top_p; + dez.deserialize(top_p); + SamplingParams params; + params.top_p = top_p; + return ff.get_or_create_node(inputs[0], params); +} + +Op *Sampling::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + SamplingParams params = get_params(); + return new Sampling(ff, params, inputs[0], this->name); +} + +bool Sampling::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SamplingParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.top_p); + return key; +} +}; // namespace std \ No newline at end of file diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp new file mode 100644 index 0000000000..4901fe400c --- /dev/null +++ b/src/ops/sampling.cpp @@ -0,0 +1,67 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/sampling.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +/*static*/ +template +void Sampling::forward_kernel(SamplingMeta const *m, + DT *input_ptr, + int *indices_ptr, + float const top_p, + int const length, + int const batch_size, + hipStream_t stream) {} + +/*static*/ +void Sampling::forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_RMS_NORM); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + } +} + +SamplingMeta::SamplingMeta(FFHandler handler, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input) + : OpMeta(handler, op) {} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu new file mode 100644 index 0000000000..a91263a621 --- /dev/null +++ b/src/ops/sampling.cu @@ -0,0 +1,267 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cub/cub.cuh" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/sampling.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +constexpr int SamplingNumThreads = 1024; +struct BlockPrefixCallbackOp { + // Running prefix + float running_total; + // Constructor + __device__ BlockPrefixCallbackOp(float running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. 
+  __device__ float operator()(float block_aggregate) {
+    float old_prefix = running_total;
+    running_total += block_aggregate;
+    return old_prefix;
+  }
+};
+
+__global__ void init_idxs(int batch_size,
+                          int vocab_size,
+                          int total_eles,
+                          int *idx,
+                          int *begin_offset,
+                          int *end_offset) {
+  CUDA_KERNEL_LOOP(i, total_eles) {
+    idx[i] = i % vocab_size;
+    if (i % vocab_size == 0) {
+      begin_offset[i / vocab_size] = i;
+      end_offset[i / vocab_size] = i;
+    }
+  }
+}
+
+__global__ void
+    init_random_kernel(curandState *state, int batch_size, long rand) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
+    curand_init(rand, i, 0, &state[i]);
+  }
+}
+
+// multinomial and gather
+template <typename DT, int BLOCK_SIZE>
+__global__ void sampling_topp_kernel(int batch_size,
+                                     int const vocab_size,
+                                     curandState *state,
+                                     DT *sorted_logits,
+                                     int *sorted_idx,
+                                     int *indices_ptr,
+                                     float topp) {
+  // int const vocab_id = threadIdx.x;
+  int const batch_idx = blockIdx.x;
+  __shared__ float random_n;
+  __shared__ long long result_idx;
+
+  // random num
+  if (threadIdx.x == 0) {
+    // number must < topp
+    random_n = curand_uniform(state + batch_idx) * topp;
+    // printf("batch idx: %d, random num%f\n", batch_idx, random_n);
+  }
+
+  __syncthreads();
+
+  // cumsum;
+  typedef cub::BlockScan<float, BLOCK_SIZE> BlockScan;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  int offset = batch_idx * vocab_size;
+  float prefix_sum = 0.0f;
+  BlockPrefixCallbackOp prefix_op(0);
+  result_idx = vocab_size - 1;
+
+  for (long long j = threadIdx.x; j < vocab_size; j += blockDim.x) {
+    float logit = (float)(sorted_logits[offset + j]);
+    BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op);
+    prefix_sum /= topp;
+    if (prefix_sum >= random_n) {
+      atomicMin(&result_idx, j);
+    }
+  }
+  indices_ptr[batch_idx] = sorted_idx[offset + result_idx];
+
+  // if (threadIdx.x == 0) {
+  //   printf("selected idx: %d, %d\n", blockIdx.x, result_idx);
+  // }
+}
+
+/*static*/
+template <typename DT>
+void Sampling::forward_kernel(SamplingMeta const *m,
+                              DT *input_ptr,
+                              int *indices_ptr,
+                              float const top_p,
+                              int const length,
+                              int const batch_size,
+                              cudaStream_t stream) {
+  // 1. sort
+  size_t temp_storage_bytes = m->temp_storage_bytes;
+  checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending(
+      m->d_temp_storage,
+      temp_storage_bytes,
+      input_ptr,
+      static_cast<DT *>(m->sorted_logits),
+      m->idx,
+      m->sorted_idx,
+      length * batch_size,
+      batch_size,
+      m->begin_offset,
+      m->end_offset + 1,
+      0,              // begin_bit
+      sizeof(DT) * 8, // end_bit = sizeof(KeyT) * 8
+      stream));
+  int parallelism = batch_size;
+  init_random_kernel<<<GET_BLOCKS(parallelism),
+                       min(CUDA_NUM_THREADS, parallelism),
+                       0,
+                       stream>>>(m->state, batch_size, rand());
+  // sampling
+  sampling_topp_kernel<DT, SamplingNumThreads>
+      <<<batch_size, SamplingNumThreads, 0, stream>>>(
+          batch_size,
+          length,
+          m->state,
+          static_cast<DT *>
(m->sorted_logits), + m->sorted_idx, + indices_ptr, + top_p); +} + +/*static*/ +void Sampling::forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + + if (input.data_type == DT_HALF) { + Sampling::forward_kernel(m, + input.get_half_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); + } else if (input.data_type == DT_FLOAT) { + Sampling::forward_kernel(m, + input.get_float_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[Sampling] forward time = %.2lfms\n", elapsed); + } +} + +SamplingMeta::SamplingMeta(FFHandler handler, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input) + : OpMeta(handler, op) { + DataType data_type = op->data_type; + checkCUDA(cudaMalloc(&begin_offset, (batch_size + 1) * sizeof(int))); + checkCUDA(cudaMalloc(&end_offset, (batch_size + 1) * sizeof(int))); + checkCUDA(cudaMalloc(&idx, total_ele * sizeof(int))); + + checkCUDA(cudaMalloc(&sorted_idx, total_ele * sizeof(int))); + checkCUDA(cudaMalloc(&sorted_logits, total_ele * data_type_size(data_type))); + cudaMalloc(&state, sizeof(curandState) * batch_size); + + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // init offset + int parallelism = total_ele; + init_idxs<<>>(batch_size, + total_ele / batch_size, + total_ele, + idx, + begin_offset, + end_offset); + + // init sort function + if (data_type == DT_FLOAT) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_float_ptr(), + input.get_float_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + } else if (data_type == DT_HALF) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_half_ptr(), + input.get_half_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + } else { + assert(false && "input type in float and half"); + } + checkCUDA(cudaMalloc(&d_temp_storage, temp_storage_bytes)); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 39f9d1dd0d..a777605daf 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -178,6 +178,8 @@ std::string get_operator_type_name(OperatorType type) { return "GELU"; case OP_IDENTITY: return "Identity"; + case OP_SAMPLING: + return "Sampling"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5c0513baa8..16bccc25df 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -40,6 +40,7 @@ #include 
"flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -2919,6 +2920,10 @@ void FFModel::deserialize_graph_optimal_view( node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_SAMPLING: { + node = Sampling::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_GROUP_BY: { node = Group_by::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5179178cd9..22515a2bb0 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -52,6 +52,7 @@ #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -2937,6 +2938,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SAMPLING: { + Op *op = Sampling::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_GROUP_BY: { Op *op = Group_by::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -2977,7 +2983,8 @@ void FFModel::create_operators_from_layers() { Op *op = nullptr; // add a combine before arg_topk if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && l->op_type == OP_ARG_TOPK) { + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -5406,6 +5413,37 @@ void register_flexflow_internal_tasks(Runtime *runtime, BeamTopK::inference_task>(registrar); } } + // Sampling task + { + TaskVariantRegistrar registrar(SAMPLING_INIT_TASK_ID, "Sampling Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Sampling Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(SAMPLING_INF_TASK_ID, "Sampling Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Sampling Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // Transpose task { TaskVariantRegistrar registrar(TRANSPOSE_INIT_TASK_ID, "Transpose Init"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b61d5ac7a..8fb8c89b10 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -130,6 +131,8 @@ tl::optional get_op_parameters(Op const *op) { return ((ArgTopK *)op)->get_params(); case OP_BEAM_TOPK: return ((BeamTopK *)op)->get_params(); + case OP_SAMPLING: + return ((Sampling 
*)op)->get_params(); // TODO: implement the get_params() function for the operators below and // uncomment the lines below From 8b18d11add774de45351a1491c924be14ca3ee45 Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:22:59 -0400 Subject: [PATCH 3/4] Update Docker workflow and README (#867) * docker_build_skip update Co-authored-by: Gabriele Oliaro **Description of changes:** Update `docker-build-skip.yml` file to match with the `docker-build` in current `master` branch. **Related Issues:** Linked Issues: - Issue # Issues closed by this PR: - Closes # **Before merging:** - [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules? * save cuda tests * fixed * bug fix * bug fix * moved envs and simplified if statements * update readme --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/docker-build-skip.yml | 15 +++++++ .github/workflows/docker-build.yml | 54 +++++++++++++++++-------- README.md | 2 +- docker/README.md | 5 +-- 4 files changed, 55 insertions(+), 21 deletions(-) diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml index a09979283f..59b584c6c4 100644 --- a/.github/workflows/docker-build-skip.yml +++ b/.github/workflows/docker-build-skip.yml @@ -19,6 +19,21 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] + cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"] + # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported + exclude: + - gpu_backend: "hip_rocm" + cuda_version: "11.1" + - gpu_backend: "hip_rocm" + cuda_version: "11.2" + - gpu_backend: "hip_rocm" + cuda_version: "11.3" + - gpu_backend: "hip_rocm" + cuda_version: "11.5" + - gpu_backend: "hip_rocm" + cuda_version: "11.6" + - gpu_backend: "hip_rocm" + cuda_version: "11.7" fail-fast: false steps: - run: 'echo "No docker-build required"' diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 40c86c1600..280539eb5f 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -25,8 +25,11 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6","11.7", "11.8"] + cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"] + # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported exclude: + - gpu_backend: "hip_rocm" + cuda_version: "11.1" - gpu_backend: "hip_rocm" cuda_version: "11.2" - gpu_backend: "hip_rocm" @@ -37,10 +40,11 @@ jobs: cuda_version: "11.6" - gpu_backend: "hip_rocm" cuda_version: "11.7" - - gpu_backend: "hip_rocm" - cuda_version: "11.8" - fail-fast: false + env: + FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + cuda_version: ${{ matrix.cuda_version }} + branch_name: ${GITHUB_REF#refs/heads/} steps: - name: Checkout Git Repository uses: actions/checkout@v3 @@ -48,43 +52,59 @@ jobs: submodules: recursive - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh + env: + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} + build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( 
matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} + run: | + if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then + .github/workflows/helpers/free_space_on_runner.sh + else + echo "Skipping this step to save time" + fi - name: Build Docker container env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - cuda_version: ${{ matrix.cuda_version }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} + build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} run: | # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then + if [[ $deploy_needed == "true" ]] ; then export FF_CUDA_ARCH=all - else + ./docker/build.sh flexflow + elif [[ $build_needed == "true" ]]; then export FF_CUDA_ARCH=70 + ./docker/build.sh flexflow + else + echo "Skipping build to save time" fi - ./docker/build.sh flexflow - name: Check availability of Python flexflow.core module if: ${{ matrix.gpu_backend == 'cuda' }} - env: - cuda_version: ${{ matrix.cuda_version }} - run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + env: + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} + build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} + run: | + if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then + docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + else + echo "Skipping test to save time" + fi - name: Publish Docker environment image (on push to inference) if: github.repository_owner == 'flexflow' env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - cuda_version: ${{ matrix.cuda_version }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} run: | - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then + if [[ $deploy_needed == "true" ]]; then ./docker/publish.sh flexflow-environment ./docker/publish.sh flexflow else echo "No need to update Docker containers in ghrc.io registry at this time." fi - + notify-slack: name: Notify Slack in case of failure runs-on: ubuntu-20.04 diff --git a/README.md b/README.md index c26904749d..9ad900fb3c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. 
 Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).
 ## Install FlexFlow
-To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon).
+To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon).
 ## PyTorch Support
 Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
diff --git a/docker/README.md b/docker/README.md
index 4c52a66b6a..916b78acf6 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -7,9 +7,8 @@ You will need a machine with a NVIDIA GPU, with drivers installed. You will also
 ## Downloading a pre-built package
 The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):
-* [flexflow-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda): the pre-built version of FlexFlow targeting GPUs with a CUDA backend, for CUDA versions 11.1, 11.3, 11.7, 11.2, 11.5, 11.6, and 11.8.
-* [flexflow-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-hip_rocm): the pre-built version of FlexFlow targeting GPUs with a HIP-ROCM backend.
-* [flexflow-environment-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda) and [flexflow-environment-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-hip_rocm): these are the base layers for `flexflow-cuda` and `flexflow-hip_rocm`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow.
+* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions: 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-<CUDA version>`, e.g.
[flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8) +* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8). The easiest way to download any of the Docker containers above is to call: From 3e23dd8444af7ff24120a0cdc2e95a4afec592d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 19 Jul 2023 16:07:26 -0400 Subject: [PATCH 4/4] fix --- .github/workflows/docker-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 280539eb5f..d059a0605f 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -44,7 +44,7 @@ jobs: env: FF_GPU_BACKEND: ${{ matrix.gpu_backend }} cuda_version: ${{ matrix.cuda_version }} - branch_name: ${GITHUB_REF#refs/heads/} + branch_name: ${{ github.head_ref || github.ref_name }} steps: - name: Checkout Git Repository uses: actions/checkout@v3
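For reference, the Sampling operator introduced in this patch series amounts to per-request top-p (nucleus) sampling: sort the token probabilities in descending order, draw a uniform random number scaled by top_p, and return the first token whose running (inclusive) prefix sum reaches that target. The short CPU-only C++ sketch below illustrates that flow under stated assumptions: the input is already a probability distribution (the Softmax that precedes Sampling in the model provides this), the helper name sample_top_p is illustrative rather than part of the FlexFlow API, and the formulation is the textbook one, so it may differ in small details from sampling_topp_kernel above.

#include <algorithm>
#include <cassert>
#include <numeric>
#include <random>
#include <vector>

// Return the index (into `probs`) of the sampled token.
// `probs` must be a probability distribution (non-negative, summing to ~1).
int sample_top_p(std::vector<float> const &probs, float top_p, std::mt19937 &rng) {
  assert(!probs.empty() && top_p > 0.0f && top_p <= 1.0f);

  // 1. Sort token ids by probability, descending (the GPU path uses
  //    cub::DeviceSegmentedRadixSort for this step).
  std::vector<int> ids(probs.size());
  std::iota(ids.begin(), ids.end(), 0);
  std::sort(ids.begin(), ids.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // 2. Draw the target cumulative mass uniformly in [0, top_p), so only the
  //    head of the distribution whose mass reaches top_p can be selected.
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  float const target = uniform(rng) * top_p;

  // 3. Walk the inclusive prefix sum of the sorted probabilities and return
  //    the first token whose cumulative probability reaches the target (the
  //    GPU path computes this prefix sum with cub::BlockScan).
  float cumulative = 0.0f;
  for (int id : ids) {
    cumulative += probs[id];
    if (cumulative >= target) {
      return id;
    }
  }
  return ids.back(); // numerical safety net
}

For example, with probabilities {0.5, 0.3, 0.1, 0.1} and top_p = 0.8, only the two highest-probability tokens can ever be returned, with relative weights 0.5 and 0.3.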