diff --git a/.dockerignore b/.dockerignore
index 22ec965249..b9f228c009 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -11,3 +11,11 @@ python/flexflow/core/legion_cffi_header.py
*.pb.h
*.o
*.a
+
+# Ignore inference assets
+/inference/weights/*
+/inference/tokenizer/*
+/inference/prompt/*
+/inference/output/*
+
+/tests/inference/python_test_configs/*.json
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 183028b022..e8177cd9b7 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -10,6 +10,3 @@ Linked Issues:
Issues closed by this PR:
- Closes #
-**Before merging:**
-
-- [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules?
diff --git a/.github/README.md b/.github/README.md
new file mode 100644
index 0000000000..5aba2295d5
--- /dev/null
+++ b/.github/README.md
@@ -0,0 +1,255 @@
+# FlexFlow Serve: Low-Latency, High-Performance LLM Serving
+![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
+
+
+---
+
+## What is FlexFlow Serve
+
+The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply.
+FlexFlow Serve is an open-source compiler and distributed system for
+__low-latency__, __high-performance__ LLM serving. FlexFlow Serve outperforms
+existing systems by 1.3-2.0x for single-node, multi-GPU inference and by
+1.4-2.4x for multi-node, multi-GPU inference.
+
+
+
+
+
+
+## Install FlexFlow Serve
+
+
+### Requirements
+* OS: Linux
+* GPU backend: HIP/ROCm or CUDA
+ * CUDA version: 10.2 – 12.0
+ * NVIDIA compute capability: 6.0 or higher
+* Python: 3.6 or higher
+* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt)
+
+### Install with pip
+You can install FlexFlow Serve using pip:
+
+```bash
+pip install flexflow
+```
+
+### Try it in Docker
+If you run into any issues during installation, or if you would like to use the C++ API without building from source, you can also use our pre-built Docker packages, available for multiple CUDA versions (NVIDIA backend) and ROCm versions (AMD backend). To download and run our pre-built Docker container:
+
+```bash
+docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
+```
+
+To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, or `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source or run with additional configurations, can be found [here](../docker/README.md).
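+
+For instance, a minimal sketch of pulling and running the ROCm 5.6 image instead (note that exposing AMD GPUs to the container may require different device flags than the NVIDIA `--gpus all` flag shown above):
+
+```bash
+docker run -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-hip_rocm-5.6:latest
+```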
+
+### Build from source
+
+You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html).
+
+## Quickstart
+The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively.
+We need to make sure the aggregated GPU memory and zero-copy memory are **both** sufficient to store the LLM parameters when serving without offloading. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving.
+```python
+import flexflow.serve as ff
+
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+```
+Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms).
+```python
+# Specify the LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Specify a list of SSMs (just one in this case)
+ssms = []
+ssm = ff.SSM("JackFram/llama-68m")
+ssms.append(ssm)
+```
+Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. You can also use the following arguments to specify serving configuration when compiling LLMs and SSMs:
+
+* max\_requests\_per\_batch: the maximum number of requests to serve in a batch (default: 16)
+* max\_seq\_length: the maximum number of tokens in a request (default: 256)
+* max\_tokens\_per\_batch: the maximum number of tokens to process in a batch (default: 128)
+
+```python
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=False, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the SSMs for inference and load the weights into memory
+for ssm in ssms:
+ ssm.compile(generation_config)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config,
+ max_requests_per_batch = 16,
+ max_seq_length = 256,
+ max_tokens_per_batch = 128,
+ ssms=ssms)
+```
+Next, we call `llm.start_server()` to start an LLM server on a separate background thread, which allows users to perform computations in parallel with LLM serving. Finally, we call `llm.generate` to generate the output, which is returned as a list of `GenerationResult` objects containing the output tokens and text. After all serving requests are processed, you can either call `llm.stop_server()` to terminate the background thread or simply exit the Python program, which will automatically terminate the background server thread.
+```python
+llm.start_server()
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+llm.stop_server() # This invocation is optional
+```
+
+### Incremental decoding
+
+
+
+```python
+import flexflow.serve as ff
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+
+# Create the FlexFlow LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=True, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config,
+ max_requests_per_batch = 16,
+ max_seq_length = 256,
+ max_tokens_per_batch = 128)
+
+# Generation begins!
+llm.start_server()
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+llm.stop_server() # This invocation is optional
+```
+
+
+
+### C++ interface
+If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source and follow the instructions below.
+
+
+
+
+#### Downloading models
+Before running FlexFlow Serve, you should manually download the LLM and SSM(s) of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script downloads all of a model's assets (weights, configs, tokenizer files, etc.) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can specify it via the `--cache-folder` parameter.
+
+```bash
+python3 ./inference/utils/download_hf_model.py ...
+```
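+
+For instance, a sketch of downloading the LLaMA-2-7B LLM and the LLaMA-68M SSM used in the examples above might look as follows (the positional model-id arguments are an assumption here; the script itself defines the exact CLI, and `--cache-folder` is optional):
+
+```bash
+python3 ./inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf JackFram/llama-68m --cache-folder ~/.cache/flexflow
+```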
+
+#### Running the C++ examples
+A C++ example is available at [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve:
+
+* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0)
+* `-ll:fsize`: size of device memory on each GPU in MB
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
+* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
+* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+* `-cache-folder`: path of the folder where the model assets (weights, configs, tokenizer files, etc.) are cached (default: `~/.cache/flexflow`)
+* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
+* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a JSON file containing a list of prompts. Alternatively, requests can be registered programmatically through the Python API shown in the Quickstart.
+* `-output-file`: (optional) path of the file where the output of the model and the generation latency will be saved
+
+For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs, using a collectively boost-tuned LLaMA-68M model for speculative inference (a variant with multiple SSMs is sketched after the command).
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+```
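+
+To launch multiple SSMs and save the generated output, a variant of the above command might look like the following (the output path is just a placeholder):
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -ssm-model JackFram/llama-160m -prompt /path/to/prompt.json -output-file /path/to/output.json -tensor-parallelism-degree 4 --fusion
+```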
+
+
+## Speculative Inference
+A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative
+inference, which combines various collectively boost-tuned small speculative
+models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The correctness
+of all candidate token sequences represented by a token tree is verified against the
+LLM’s output in parallel using a novel tree-based parallel decoding mechanism.
+FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder,
+which significantly reduces the end-to-end inference latency and computational requirements
+for serving generative LLMs while provably preserving model quality.
+
+
+
+
+
+### Supported LLMs and SSMs
+
+FlexFlow Serve currently supports all HuggingFace models with the following architectures:
+* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...)
+* `OPTForCausalLM` (models from the OPT family)
+* `RWForCausalLM` (models from the Falcon family)
+* `GPTBigCodeForCausalLM` (models from the Starcoder family)
+
+Below is a list of models that we have explicitly tested and for which an SSM may be available (a usage sketch pairing one of these models with its SSM follows the table):
+
+| Model | Model id on HuggingFace | Boost-tuned SSMs |
+| :---- | :---- | :---- |
+| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| Falcon-7B | tiiuae/falcon-7b | |
+| Falcon-40B | tiiuae/falcon-40b | |
+| StarCoder-7B | bigcode/starcoderbase-7b | |
+| StarCoder-15.5B | bigcode/starcoder | |
+
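+As a minimal sketch that reuses the Python API from the Quickstart with a tested LLM/SSM pair from the table above (OPT-13B with its boost-tuned OPT-125M), serving would look roughly like this:
+
+```python
+import flexflow.serve as ff
+
+ff.init(
+    num_gpus=4,
+    memory_per_gpu=14000,
+    zero_copy_memory_per_node=30000,
+    tensor_parallelism_degree=4,
+    pipeline_parallelism_degree=1
+)
+
+# LLM / boost-tuned SSM pair taken from the table above
+llm = ff.LLM("facebook/opt-13b")
+ssms = [ff.SSM("facebook/opt-125m")]
+
+# Compile the SSM(s) and the LLM, then serve a prompt
+generation_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1)
+for ssm in ssms:
+    ssm.compile(generation_config)
+llm.compile(generation_config, ssms=ssms)
+
+llm.start_server()
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+llm.stop_server()
+```
+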
+### CPU Offloading
+FlexFlow Serve also offers offloading-based inference for running large models (e.g., LLaMA-7B) on a single GPU. With CPU offloading, selected tensors are kept in CPU memory and only copied to the GPU when needed for computation. Currently, we selectively offload the largest weight tensors (the weight tensors in the Linear and Attention layers). Since the small SSM occupies considerably less space and does not pose a bottleneck for GPU memory, while offloading adds extra runtime memory traffic and computational cost, we only apply offloading to the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags.
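+
+As a rough sketch, and only as an illustration until the instructions above are updated, offloading might be enabled by appending the two flags to a serving command; the reserve-space value shown here is an assumed placeholder:
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 1 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -offload -offload-reserve-space-size 8000 --fusion
+```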
+
+### Quantization
+FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side; once copied to the GPU, they are decompressed and converted back to their original precision. You can find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to perform the compression manually.
+
+### Prompt Datasets
+We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
+
+## TODOs
+
+FlexFlow Serve is under active development. We are currently focusing on the following tasks and strongly welcome all contributions, from bug fixes to new features and extensions.
+
+* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
+* Chatbot prompt templates and multi-round conversations
+* Support for FastAPI server
+* Integration with LangChain for document question answering
+
+## Acknowledgements
+This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
+
+``` bibtex
+@misc{miao2023specinfer,
+ title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification},
+ author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia},
+ year={2023},
+ eprint={2305.09781},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+
+## License
+FlexFlow is licensed under the Apache License 2.0.
diff --git a/.github/workflows/build-skip.yml b/.github/workflows/build-skip.yml
index b3ab69e9c1..8635c0d137 100644
--- a/.github/workflows/build-skip.yml
+++ b/.github/workflows/build-skip.yml
@@ -3,6 +3,7 @@ on:
pull_request:
paths-ignore:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ada29c5798..ef5961bc87 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -3,6 +3,7 @@ on:
pull_request:
paths:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
@@ -15,6 +16,7 @@ on:
- "master"
paths:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
@@ -38,6 +40,8 @@ jobs:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
fail-fast: false
+ env:
+ FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
@@ -48,21 +52,23 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
+ if: ${{ matrix.gpu_backend == 'cuda' }}
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
+ log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt'
- name: Install system dependencies
- run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh
+ run: .github/workflows/helpers/install_dependencies.sh
- name: Install conda and FlexFlow dependencies
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
- environment-file: conda/environment.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build FlexFlow
@@ -70,17 +76,25 @@ jobs:
export CUDNN_DIR="$CUDA_PATH"
export CUDA_DIR="$CUDA_PATH"
export FF_HOME=$(pwd)
- export FF_GPU_BACKEND=${{ matrix.gpu_backend }}
export FF_CUDA_ARCH=70
+ export FF_HIP_ARCH=gfx1100,gfx1036
+ export hip_version=5.6
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ export FF_BUILD_ALL_EXAMPLES=ON
+ export FF_BUILD_UNIT_TESTS=ON
+ else
+ export FF_BUILD_ALL_EXAMPLES=OFF
+ export FF_BUILD_UNIT_TESTS=OFF
+ fi
+
cores_available=$(nproc --all)
n_build_cores=$(( cores_available -1 ))
if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi
mkdir build
cd build
- if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
- export FF_BUILD_ALL_EXAMPLES=ON
- export FF_BUILD_UNIT_TESTS=ON
- fi
+
../config/config.linux
make -j $n_build_cores
@@ -89,25 +103,24 @@ jobs:
export CUDNN_DIR="$CUDA_PATH"
export CUDA_DIR="$CUDA_PATH"
export FF_HOME=$(pwd)
- export FF_GPU_BACKEND=${{ matrix.gpu_backend }}
export FF_CUDA_ARCH=70
- cd build
+ export FF_HIP_ARCH=gfx1100,gfx1036
+ export hip_version=5.6
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
- export FF_BUILD_ALL_EXAMPLES=ON
+ export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_UNIT_TESTS=ON
+ else
+ export FF_BUILD_ALL_EXAMPLES=OFF
+ export FF_BUILD_UNIT_TESTS=OFF
fi
+
+ cd build
../config/config.linux
sudo make install
sudo ldconfig
- - name: Check availability of Python flexflow.core module
- if: ${{ matrix.gpu_backend == 'cuda' }}
- run: |
- export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
- sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
- export CPU_ONLY_TEST=1
- python -c "import flexflow.core; exit()"
-
- name: Run C++ unit tests
if: ${{ matrix.gpu_backend == 'cuda' }}
run: |
@@ -115,9 +128,19 @@ jobs:
export CUDA_DIR="$CUDA_PATH"
export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
export FF_HOME=$(pwd)
+ sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
cd build
./tests/unit/unit-test
+ - name: Check availability of flexflow modules in Python
+ run: |
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
+ fi
+ # Remove build folder to check that the installed version can run independently of the build files
+ rm -rf build
+ python -c "import flexflow.core; import flexflow.serve as ff; exit()"
+
makefile-build:
name: Build FlexFlow with the Makefile
runs-on: ubuntu-20.04
@@ -134,11 +157,12 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
use-github-cache: "false"
+ log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt'
- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
@@ -147,7 +171,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
- environment-file: conda/environment.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build FlexFlow
@@ -163,5 +187,4 @@ jobs:
cd python
make -j $n_build_cores
- export CPU_ONLY_TEST=1
python -c 'import flexflow.core'
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 46c9bf3be2..fdf53e8254 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -10,7 +10,7 @@ jobs:
- check: "src"
exclude: '\.proto$'
- check: "include"
- - check: "nmt"
+ - check: "inference"
- check: "python"
- check: "scripts"
- check: "tests"
diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml
index 59b584c6c4..e5d7de858f 100644
--- a/.github/workflows/docker-build-skip.yml
+++ b/.github/workflows/docker-build-skip.yml
@@ -13,27 +13,22 @@ concurrency:
cancel-in-progress: true
jobs:
- docker-build:
- name: Build and Install FlexFlow in a Docker Container
- runs-on: ubuntu-20.04
+ docker-build-rocm:
+ name: Build and Install FlexFlow in a Docker Container (ROCm backend)
+ runs-on: ubuntu-latest
strategy:
matrix:
- gpu_backend: ["cuda", "hip_rocm"]
- cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
- # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
- exclude:
- - gpu_backend: "hip_rocm"
- cuda_version: "11.1"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.2"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.3"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.5"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.6"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.7"
+ hip_version: ["5.3", "5.4", "5.5", "5.6"]
+ fail-fast: false
+ steps:
+ - run: 'echo "No docker-build required"'
+
+ docker-build-cuda:
+ name: Build and Install FlexFlow in a Docker Container (CUDA backend)
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index d059a0605f..eeaab0e0af 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -7,10 +7,11 @@ on:
- ".github/workflows/docker-build.yml"
push:
branches:
+ - "inference"
- "master"
schedule:
- # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
- - cron: "0 8 * * 0"
+ # At 00:00 on day-of-month 1, 14, and 28.
+ - cron: "0 0 1,14,28 * *"
workflow_dispatch:
# Cancel outdated workflows if they are still running
@@ -19,53 +20,121 @@ concurrency:
cancel-in-progress: true
jobs:
- docker-build:
- name: Build and Install FlexFlow in a Docker Container
+ rocm-builder-start:
+ name: Start an AWS instance to build the ROCM Docker images
+ runs-on: ubuntu-latest
+ if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ env:
+ ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
+ steps:
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+ - name: Start EC2 instance
+ run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
+
+ docker-build-rocm:
+ name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ubuntu-20.04
+ if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }}
+ env:
+ FF_GPU_BACKEND: "hip_rocm"
+ hip_version: 5.6
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+
+ - name: Build Docker container
+ run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow
+
+ - name: Check availability of flexflow modules in Python
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
+
+ keep-runner-registered:
+ name: Keep runner alive
+ if: ${{ github.event_name == 'schedule' }}
+ runs-on: [self-hosted, rocm_builder]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: rocm-builder-start
+ steps:
+ - name: Keep alive
+ run: |
+ echo "Keep self-hosted runner registered with Github"
+ sleep 10m
+
+ docker-build-and-publish-rocm:
+ name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
+ needs: rocm-builder-start
+ runs-on: [self-hosted, rocm_builder]
+ if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
strategy:
matrix:
- gpu_backend: ["cuda", "hip_rocm"]
- cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
- # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
- exclude:
- - gpu_backend: "hip_rocm"
- cuda_version: "11.1"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.2"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.3"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.5"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.6"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.7"
+ hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
env:
- FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
- cuda_version: ${{ matrix.cuda_version }}
- branch_name: ${{ github.head_ref || github.ref_name }}
+ FF_GPU_BACKEND: "hip_rocm"
+ hip_version: ${{ matrix.hip_version }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- - name: Free additional space on runner
+ - name: Build Docker container
+ # On push to inference, build for all compatible architectures, so that we can publish
+ # a pre-built general-purpose image. On all other cases, only build for one architecture
+ # to save time.
+ run: FF_HIP_ARCH=all ./docker/build.sh flexflow
+
+ - name: Check availability of flexflow modules in Python
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
+
+ - name: Publish Docker environment image (on push to inference)
env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
+ FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
- if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
- .github/workflows/helpers/free_space_on_runner.sh
- else
- echo "Skipping this step to save time"
- fi
+ ./docker/publish.sh flexflow-environment
+ ./docker/publish.sh flexflow
+
+ docker-build-cuda:
+ name: Build and Install FlexFlow in a Docker Container (CUDA backend)
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
+ fail-fast: false
+ env:
+ FF_GPU_BACKEND: "cuda"
+ cuda_version: ${{ matrix.cuda_version }}
+ steps:
+ - name: Checkout Git Repository
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ run: .github/workflows/helpers/free_space_on_runner.sh
- name: Build Docker container
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
+ deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
@@ -74,42 +143,45 @@ jobs:
export FF_CUDA_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
- export FF_CUDA_ARCH=70
+ export FF_CUDA_ARCH=86
./docker/build.sh flexflow
- else
- echo "Skipping build to save time"
fi
- - name: Check availability of Python flexflow.core module
- if: ${{ matrix.gpu_backend == 'cuda' }}
- env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
- run: |
- if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
- docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
- else
- echo "Skipping test to save time"
- fi
+ - name: Check availability of flexflow modules in Python
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
- name: Publish Docker environment image (on push to inference)
- if: github.repository_owner == 'flexflow'
+ if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
run: |
- if [[ $deploy_needed == "true" ]]; then
- ./docker/publish.sh flexflow-environment
- ./docker/publish.sh flexflow
- else
- echo "No need to update Docker containers in ghrc.io registry at this time."
- fi
+ ./docker/publish.sh flexflow-environment
+ ./docker/publish.sh flexflow
+
+ rocm-builder-stop:
+ needs: [docker-build-and-publish-rocm, keep-runner-registered]
+ if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ runs-on: ubuntu-latest
+ name: Stop the AWS instance we used to build the ROCM Docker images
+ env:
+ ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
+ steps:
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+      - name: Stop EC2 instance
+ run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
- needs: docker-build
- if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
+ needs: [docker-build-cuda, docker-build-and-publish-rocm]
+ if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
env:
diff --git a/.github/workflows/gpu-ci-daemon.yml b/.github/workflows/gpu-ci-daemon.yml
index 603b44c34e..b36e7b49e1 100644
--- a/.github/workflows/gpu-ci-daemon.yml
+++ b/.github/workflows/gpu-ci-daemon.yml
@@ -34,5 +34,6 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py --daemon
diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml
index 157f3c271a..f4cb950931 100644
--- a/.github/workflows/gpu-ci-skip.yml
+++ b/.github/workflows/gpu-ci-skip.yml
@@ -8,9 +8,15 @@ on:
- "python/**"
- "setup.py"
- "include/**"
+ - "inference/**"
- "src/**"
+ - "tests/inference/**"
+ - "conda/flexflow.yml"
- ".github/workflows/gpu-ci.yml"
- - "tests/multi_gpu_tests.sh"
+ - "tests/cpp_gpu_tests.sh"
+ - "tests/inference_tests.sh"
+ - "tests/training_tests.sh"
+ - "tests/python_interface_test.sh"
workflow_dispatch:
concurrency:
@@ -30,10 +36,18 @@ jobs:
needs: gpu-ci-concierge
steps:
- run: 'echo "No gpu-ci required"'
-
- gpu-ci-flexflow:
- name: Single Machine, Multiple GPUs Tests
+
+ inference-tests:
+ name: Inference Tests
runs-on: ubuntu-20.04
needs: gpu-ci-concierge
steps:
- run: 'echo "No gpu-ci required"'
+
+ training-tests:
+ name: Training Tests
+ runs-on: ubuntu-20.04
+ # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
+ needs: inference-tests
+ steps:
+ - run: 'echo "No gpu-ci required"'
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 3b679e9f20..00ca2df603 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -1,21 +1,10 @@
name: "gpu-ci"
on:
- pull_request:
- paths:
- - "cmake/**"
- - "config/**"
- - "deps/**"
- - "python/**"
- - "setup.py"
- - "include/**"
- - "src/**"
- - ".github/workflows/gpu-ci.yml"
- - "tests/cpp_gpu_tests.sh"
- - "tests/multi_gpu_tests.sh"
- - "tests/python_interface_test.sh"
+ schedule:
+ - cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28.
push:
branches:
- - "master"
+ - "inference"
paths:
- "cmake/**"
- "config/**"
@@ -23,10 +12,14 @@ on:
- "python/**"
- "setup.py"
- "include/**"
+ - "inference/**"
- "src/**"
+ - "tests/inference/**"
+ - "conda/flexflow.yml"
- ".github/workflows/gpu-ci.yml"
- "tests/cpp_gpu_tests.sh"
- - "tests/multi_gpu_tests.sh"
+ - "tests/inference_tests.sh"
+ - "tests/training_tests.sh"
- "tests/python_interface_test.sh"
workflow_dispatch:
@@ -48,12 +41,33 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
+ keep-runner-registered:
+ name: Keep runner alive
+ if: ${{ github.event_name == 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: gpu-ci-concierge
+ container:
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ options: --gpus all --shm-size=8192m
+ steps:
+ - name: Keep alive
+ run: |
+ echo "Keep self-hosted runner registered with Github"
+ sleep 10m
+
python-interface-check:
name: Check Python Interface
- runs-on: self-hosted
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
@@ -77,7 +91,7 @@ jobs:
with:
miniconda-version: "latest"
activate-environment: flexflow
- environment-file: conda/flexflow-cpu.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
auto-update-conda: false
@@ -89,7 +103,7 @@ jobs:
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export FF_USE_PREBUILT_LEGION=OFF
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
mkdir build
cd build
../config/config.linux
@@ -106,6 +120,7 @@ jobs:
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
cd build
../config/config.linux
make install
@@ -124,45 +139,150 @@ jobs:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/align/test_all_operators.sh
- gpu-ci-flexflow:
- name: Single Machine, Multiple GPUs Tests
- runs-on: self-hosted
- needs: python-interface-check
+ inference-tests:
+ name: Inference Tests
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
+ needs: gpu-ci-concierge
+ container:
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ options: --gpus all --shm-size=8192m
+ steps:
+ - name: Install updated git version
+ run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Install conda and FlexFlow dependencies
+ uses: conda-incubator/setup-miniconda@v2
+ with:
+ miniconda-version: "latest"
+ activate-environment: flexflow
+ environment-file: conda/flexflow.yml
+ auto-activate-base: false
+
+ - name: Build FlexFlow
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export FF_HOME=$(pwd)
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+ mkdir build
+ cd build
+ ../config/config.linux
+ make -j
+
+ - name: Run PEFT tests
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export CUDNN_DIR=/usr/local/cuda
+ export CUDA_DIR=/usr/local/cuda
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+
+ source ./build/set_python_envs.sh
+ ./tests/peft_test.sh
+
+ - name: Run inference tests
+ env:
+ CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export FF_HOME=$(pwd)
+ export CUDNN_DIR=/usr/local/cuda
+ export CUDA_DIR=/usr/local/cuda
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+
+ # GPT tokenizer test
+ # ./tests/gpt_tokenizer_test.sh
+
+ # Inference tests
+ source ./build/set_python_envs.sh
+ ./tests/inference_tests.sh
+
+ - name: Save inference output as an artifact
+ if: always()
+ run: |
+ cd inference
+ tar -zcvf output.tar.gz ./output
+
+ - name: Upload artifact
+ uses: actions/upload-artifact@v3
+ if: always()
+ with:
+ name: output
+ path: inference/output.tar.gz
+
+ # Github persists the .cache folder across different runs/containers
+ - name: Clear cache
+ if: always()
+ run: sudo rm -rf ~/.cache
+
+ training-tests:
+ name: Training Tests
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ # skip this time-consuming test for PRs to the inference branch
+ # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: inference-tests
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
+
+ - name: Install conda and FlexFlow dependencies
+ uses: conda-incubator/setup-miniconda@v2
+ with:
+ miniconda-version: "latest"
+ activate-environment: flexflow
+ environment-file: conda/flexflow.yml
+ auto-activate-base: false
- name: Build and Install FlexFlow
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_BUILD_ALL_EXAMPLES=ON
- export FF_USE_PREBUILT_LEGION=OFF
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
pip install . --verbose
- name: Check FlexFlow Python interface (pip)
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run multi-gpu tests
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# C++ tests
./tests/cpp_gpu_tests.sh 4
# Python tests
- ./tests/multi_gpu_tests.sh 4
+ ./tests/training_tests.sh 4
diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh
index 318134e331..73b8e88418 100755
--- a/.github/workflows/helpers/install_cudnn.sh
+++ b/.github/workflows/helpers/install_cudnn.sh
@@ -5,8 +5,11 @@ set -x
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+
# Install CUDNN
-cuda_version=${1:-11.8.0}
+cuda_version=${1:-12.1.1}
cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
@@ -44,6 +47,12 @@ elif [[ "$cuda_version" == "11.7" ]]; then
elif [[ "$cuda_version" == "11.8" ]]; then
CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
+elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then
+ CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
+ CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
+else
+  echo "CUDNN support for CUDA version ${cuda_version} not yet added"
+ exit 1
fi
wget -c -q $CUDNN_LINK
if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" == "11.8" ]]; then
@@ -52,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version"
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib
rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
+elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then
+ wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+ sudo dpkg -i cuda-keyring_1.1-1_all.deb
+ sudo apt update -y
+ rm -f cuda-keyring_1.1-1_all.deb
+ sudo dpkg -i $CUDNN_TARBALL_NAME
+ sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/
+ sudo apt update -y
+ sudo apt install -y libcudnn8
+ sudo apt install -y libcudnn8-dev
+ sudo apt install -y libcudnn8-samples
else
sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
fi
diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh
index 5ab211c962..6435a37eea 100755
--- a/.github/workflows/helpers/install_dependencies.sh
+++ b/.github/workflows/helpers/install_dependencies.sh
@@ -7,24 +7,61 @@ cd "${BASH_SOURCE[0]%/*}"
# General dependencies
echo "Installing apt dependencies..."
-sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \
+sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev jq && \
sudo rm -rf /var/lib/apt/lists/*
-# Install CUDNN
-./install_cudnn.sh
-
-# Install HIP dependencies if needed
FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"}
+hip_version=${hip_version:-"5.6"}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
-elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
+fi
+# Install CUDNN if needed
+if [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then
+ # Install CUDNN
+ ./install_cudnn.sh
+ # Install NCCL
+ ./install_nccl.sh
+fi
+# Install HIP dependencies if needed
+if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"
- wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb
- sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb
- rm ./amdgpu-install_22.20.50205-1_all.deb
+ # Check that hip_version is one of 5.3,5.4,5.5,5.6
+ if [[ "$hip_version" != "5.3" && "$hip_version" != "5.4" && "$hip_version" != "5.5" && "$hip_version" != "5.6" ]]; then
+ echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ # Compute script name and url given the version
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb
+ if [ "$hip_version" = "5.3" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb
+ elif [ "$hip_version" = "5.4" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb
+ elif [ "$hip_version" = "5.5" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb
+ fi
+ AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"
+ # Download and install AMD GPU software with ROCM and HIP support
+ wget "$AMD_GPU_SCRIPT_URL"
+ sudo apt-get install -y ./${AMD_GPU_SCRIPT_NAME}
+ sudo rm ./${AMD_GPU_SCRIPT_NAME}
sudo amdgpu-install -y --usecase=hip,rocm --no-dkms
- sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk
+ sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs
+
+ # Install protobuf v3.20.x manually
+ sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make
+ git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git
+ cd protobuf/
+ git submodule update --init --recursive
+ ./autogen.sh
+ ./configure
+ cores_available=$(nproc --all)
+ n_build_cores=$(( cores_available -1 ))
+ if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi
+ make -j $n_build_cores
+ sudo make install
+ sudo ldconfig
+ cd ..
else
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"
fi
diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh
new file mode 100755
index 0000000000..ae6793ea2a
--- /dev/null
+++ b/.github/workflows/helpers/install_nccl.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -euo pipefail
+set -x
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}"
+
+# Add NCCL key ring
+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt update -y
+rm -f cuda-keyring_1.1-1_all.deb
+
+# Install NCCL
+cuda_version=${1:-12.1.1}
+cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
+echo "Installing NCCL for CUDA version: ${cuda_version} ..."
+
+# We need to run a different install command based on the CUDA version, otherwise running `sudo apt install libnccl2 libnccl-dev`
+# will automatically upgrade CUDA to the latest version.
+
+if [[ "$cuda_version" == "11.0" ]]; then
+ sudo apt install libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0
+elif [[ "$cuda_version" == "11.1" ]]; then
+ sudo apt install libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1
+elif [[ "$cuda_version" == "11.2" ]]; then
+ sudo apt install libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2
+elif [[ "$cuda_version" == "11.3" ]]; then
+ sudo apt install libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3
+elif [[ "$cuda_version" == "11.4" ]]; then
+ sudo apt install libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4
+elif [[ "$cuda_version" == "11.5" ]]; then
+ sudo apt install libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5
+elif [[ "$cuda_version" == "11.6" ]]; then
+ sudo apt install libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6
+elif [[ "$cuda_version" == "11.7" ]]; then
+ sudo apt install libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7
+elif [[ "$cuda_version" == "11.8" ]]; then
+ sudo apt install libnccl2=2.16.5-1+cuda11.8 libnccl-dev=2.16.5-1+cuda11.8
+elif [[ "$cuda_version" == "12.0" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.0 libnccl-dev=2.18.3-1+cuda12.0
+elif [[ "$cuda_version" == "12.1" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.1 libnccl-dev=2.18.3-1+cuda12.1
+elif [[ "$cuda_version" == "12.2" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.2 libnccl-dev=2.18.3-1+cuda12.2
+else
+ echo "Installing NCCL for CUDA version ${cuda_version} is not supported"
+ exit 1
+fi
diff --git a/.github/workflows/helpers/oracle_con.py b/.github/workflows/helpers/oracle_con.py
new file mode 100644
index 0000000000..0891d66e99
--- /dev/null
+++ b/.github/workflows/helpers/oracle_con.py
@@ -0,0 +1,37 @@
+import oci
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="Program with optional flags")
+group = parser.add_mutually_exclusive_group()
+group.add_argument("--start", action="store_true", help="Start action")
+group.add_argument("--stop", action="store_true", help="Stop action")
+parser.add_argument("--instance_id", type=str, required=True, help="instance id required")
+args = parser.parse_args()
+
+oci_key_content = os.getenv("OCI_CLI_KEY_CONTENT")
+
+config = {
+ "user": os.getenv("OCI_CLI_USER"),
+ "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
+ "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
+ "tenancy": os.getenv("OCI_CLI_TENANCY"),
+ "region": os.getenv("OCI_CLI_REGION")
+}
+
+# Validate the OCI configuration
+oci.config.validate_config(config)
+
+# Initialize the ComputeClient to interact with VM instances
+compute = oci.core.ComputeClient(config)
+
+# The instance ID of the VM, passed via the --instance_id argument
+instance_id = args.instance_id
+
+# Perform the action
+if args.start:
+ # Start the VM
+ compute.instance_action(instance_id, "START")
+else:
+ # Stop the VM
+ compute.instance_action(instance_id, "STOP")
diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh
new file mode 100755
index 0000000000..9f5cbe147a
--- /dev/null
+++ b/.github/workflows/helpers/prebuild_legion.sh
@@ -0,0 +1,75 @@
+#! /usr/bin/env bash
+set -euo pipefail
+
+# Parse input params
+python_version=${python_version:-"empty"}
+gpu_backend=${gpu_backend:-"empty"}
+gpu_backend_version=${gpu_backend_version:-"empty"}
+
+if [[ "${gpu_backend}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
+ echo "Error, value of gpu_backend (${gpu_backend}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'."
+ exit 1
+else
+ echo "Pre-building Legion with GPU backend: ${gpu_backend}"
+fi
+
+if [[ "${gpu_backend}" == "cuda" || "${gpu_backend}" == "hip_cuda" ]]; then
+ # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet.
+ if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}"
+ exit 1
+ fi
+ export cuda_version="$gpu_backend_version"
+elif [[ "${gpu_backend}" == "hip_rocm" ]]; then
+ # Check that HIP version is supported
+ if [[ "$gpu_backend_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ export hip_version="$gpu_backend_version"
+else
+ echo "gpu backend: ${gpu_backend} and gpu_backend_version: ${gpu_backend_version} not yet supported."
+ exit 1
+fi
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}"
+
+export FF_GPU_BACKEND="${gpu_backend}"
+export FF_CUDA_ARCH=all
+export FF_HIP_ARCH=all
+export BUILD_LEGION_ONLY=ON
+export INSTALL_DIR="/usr/legion"
+export python_version="${python_version}"
+
+# Build Docker Flexflow Container
+echo "building docker"
+../../../docker/build.sh flexflow
+
+# Cleanup any existing container with the same name
+docker rm prelegion || true
+
+# Create container to be able to copy data from the image
+docker create --name prelegion flexflow-"${gpu_backend}"-"${gpu_backend_version}":latest
+
+# Copy legion libraries to host
+echo "extract legion library assets"
+mkdir -p ../../../prebuilt_legion_assets
+rm -rf ../../../prebuilt_legion_assets/tmp || true
+docker cp prelegion:$INSTALL_DIR ../../../prebuilt_legion_assets/tmp
+
+
+# Create the tarball file
+cd ../../../prebuilt_legion_assets/tmp
+export LEGION_TARBALL="legion_ubuntu-20.04_${gpu_backend}-${gpu_backend_version}_py${python_version}.tar.gz"
+
+echo "Creating archive $LEGION_TARBALL"
+tar -zcvf "../$LEGION_TARBALL" ./
+cd ..
+echo "Checking the size of the Legion tarball..."
+du -h "$LEGION_TARBALL"
+
+
+# Cleanup
+rm -rf tmp/*
+docker rm prelegion
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml
index 37f81b615f..2fc527bf08 100644
--- a/.github/workflows/multinode-test.yml
+++ b/.github/workflows/multinode-test.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
@@ -37,7 +38,7 @@ jobs:
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -77,7 +78,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
multinode-gpu-test-ucx:
name: Multinode GPU Test with UCX
@@ -86,7 +87,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
@@ -128,7 +129,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
multinode-gpu-test-native-ucx:
name: Multinode GPU Test with native UCX
@@ -137,7 +138,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -176,7 +177,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
notify-slack:
name: Notify Slack in case of failure
diff --git a/.github/workflows/pip-install-skip.yml b/.github/workflows/pip-install-skip.yml
index f2606b94d8..92c3223e32 100644
--- a/.github/workflows/pip-install-skip.yml
+++ b/.github/workflows/pip-install-skip.yml
@@ -7,6 +7,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
workflow_dispatch:
diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml
index 7d60d3bf52..d5acbfc2e1 100644
--- a/.github/workflows/pip-install.yml
+++ b/.github/workflows/pip-install.yml
@@ -7,6 +7,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
push:
@@ -18,6 +19,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
workflow_dispatch:
@@ -42,10 +44,10 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
@@ -64,10 +66,11 @@ jobs:
export FF_HOME=$(pwd)
export FF_CUDA_ARCH=70
pip install . --verbose
+ # Remove build folder to check that the installed version can run independently of the build files
+ rm -rf build
- - name: Check availability of Python flexflow.core module
+ - name: Check availability of flexflow modules in Python
run: |
export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
- export CPU_ONLY_TEST=1
- python -c "import flexflow.core; exit()"
+ python -c 'import flexflow.core; import flexflow.serve as ff; exit()'
diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml
new file mode 100644
index 0000000000..633fb00eb8
--- /dev/null
+++ b/.github/workflows/prebuild-legion.yml
@@ -0,0 +1,84 @@
+name: "prebuild-legion"
+on:
+ push:
+ branches:
+ - "inference"
+ paths:
+ - "cmake/**"
+ - "config/**"
+ - "deps/legion/**"
+ - ".github/workflows/helpers/install_dependencies.sh"
+ workflow_dispatch:
+concurrency:
+ group: prebuild-legion-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ prebuild-legion:
+ name: Prebuild Legion with CMake
+ runs-on: ubuntu-20.04
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ strategy:
+ matrix:
+ gpu_backend: ["cuda", "hip_rocm"]
+ gpu_backend_version: ["12.0", "5.6"]
+ python_version: ["3.11"]
+ exclude:
+ - gpu_backend: "cuda"
+ gpu_backend_version: "5.6"
+ - gpu_backend: "hip_rocm"
+ gpu_backend_version: "12.0"
+ fail-fast: false
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+
+ - name: Build Legion
+ env:
+ gpu_backend: ${{ matrix.gpu_backend }}
+ gpu_backend_version: ${{ matrix.gpu_backend_version }}
+ python_version: ${{ matrix.python_version }}
+ run: .github/workflows/helpers/prebuild_legion.sh
+
+      - name: Archive compiled Legion library
+ uses: actions/upload-artifact@v3
+ with:
+ name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }}
+ path: prebuilt_legion_assets/legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }}.tar.gz
+
+ create-release:
+ name: Create new release
+ runs-on: ubuntu-20.04
+ needs: prebuild-legion
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+ - name: Create folder for artifacts
+ run: mkdir artifacts unwrapped_artifacts
+ - name: Download artifacts
+ uses: actions/download-artifact@v3
+ with:
+ path: ./artifacts
+ - name: Display structure of downloaded files
+ working-directory: ./artifacts
+ run: ls -R
+ - name: Unwrap all artifacts
+ working-directory: ./artifacts
+ run: find . -maxdepth 2 -mindepth 2 -type f -name "*.tar.gz" -exec mv {} ../unwrapped_artifacts/ \;
+ - name: Get datetime
+ run: echo "RELEASE_DATETIME=$(date '+%Y-%m-%dT%H-%M-%S')" >> $GITHUB_ENV
+ - name: Release
+ env:
+ NAME: ${{ env.RELEASE_DATETIME }}
+ TAG_NAME: ${{ env.RELEASE_DATETIME }}
+ GITHUB_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }}
+ run: gh release create $TAG_NAME ./unwrapped_artifacts/*.tar.gz --repo flexflow/flexflow-third-party
diff --git a/.gitignore b/.gitignore
index 20d3979b08..cc34c1a7b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,11 @@ __pycache__/
# C extensions
*.so
+/inference/weights/*
+/inference/tokenizer/*
+/inference/prompt/*
+/inference/output/*
+
# Distribution / packaging
.Python
build/
@@ -83,10 +88,7 @@ docs/build/
# Doxygen documentation
docs/doxygen/output/
-
-# Exhale documentation
-docs/source/_doxygen/
-docs/source/c++_api/
+docs/doxygen/cpp_api/
# PyBuilder
.pybuilder/
@@ -179,6 +181,15 @@ train-labels-idx1-ubyte
# Logs
logs/
+gpt_tokenizer
# pip version
python/flexflow/version.txt
+
+inference_tensors
+hf_peft_tensors
+lora_training_logs
+
+Untitled-1.ipynb
+Untitled-2.ipynb
+tests/inference/python_test_configs/*.json
diff --git a/.gitmodules b/.gitmodules
index b8419fda94..c68582d4ac 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,3 +19,7 @@
[submodule "deps/json"]
path = deps/json
url = https://github.com/nlohmann/json.git
+[submodule "deps/tokenizers-cpp"]
+ path = deps/tokenizers-cpp
+ url = https://github.com/mlc-ai/tokenizers-cpp.git
+ fetchRecurseSubmodules = true
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ad3b81f9c..f06969ae04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.10)
project(FlexFlow)
+
include(ExternalProject)
# Set policy CMP0074 to eliminate cmake warnings
@@ -12,7 +13,21 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
endif()
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake)
set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR})
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UNDEBUG")
+set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG")
+set(CMAKE_HIP_FLAGS "-std=c++17 ${CMAKE_HIP_FLAGS} -fPIC -UNDEBUG")
+
+# set std 17
+#set(CMAKE_CXX_STANDARD 17)
+#set(CMAKE_CUDA_STANDARD 17)
+
+option(INFERENCE_TESTS "Run inference tests" OFF)
+set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path")
+if (INFERENCE_TESTS)
+ find_package(Torch REQUIRED PATHS ${LIBTORCH_PATH} NO_DEFAULT_PATH)
+ set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC ${TORCH_CXX_FLAGS}")
+ message(STATUS "LIBTORCH_PATH: ${LIBTORCH_PATH}")
+ message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}")
+endif()
# Set a default build type if none was specified
set(default_build_type "Debug")
@@ -22,8 +37,33 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
STRING "Choose the type of build." FORCE)
endif()
+# option for using Python
+option(FF_USE_PYTHON "Enable Python" ON)
+if (FF_USE_PYTHON)
+ find_package(Python3 COMPONENTS Interpreter Development)
+endif()
+
+if(INSTALL_DIR)
+ message(STATUS "INSTALL_DIR: ${INSTALL_DIR}")
+ set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE)
+else()
+ # Install DIR not set. Use default, unless a conda environment is in use
+ if ((DEFINED ENV{CONDA_PREFIX} OR (Python3_EXECUTABLE AND Python3_EXECUTABLE MATCHES "conda")) AND NOT FF_BUILD_FROM_PYPI)
+ if (DEFINED ENV{CONDA_PREFIX})
+ set(CONDA_PREFIX $ENV{CONDA_PREFIX})
+ else()
+ get_filename_component(CONDA_PREFIX "${Python3_EXECUTABLE}" DIRECTORY)
+ get_filename_component(CONDA_PREFIX "${CONDA_PREFIX}" DIRECTORY)
+ endif()
+ # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path
+ set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE)
+ message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+ endif()
+endif()
+
# do not disable assertions even if in release mode
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG")
+set(CMAKE_HIP_FLAGS_RELEASE "${CMAKE_HIP_FLAGS_RELEASE} -UNDEBUG")
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(LIBEXT ".so")
@@ -35,114 +75,23 @@ option(FF_BUILD_FROM_PYPI "Build from pypi" OFF)
# build shared or static flexflow lib
option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON)
-# option for using Python
-option(FF_USE_PYTHON "Enable Python" ON)
+# option for building legion only
+option(BUILD_LEGION_ONLY "Build Legion only" OFF)
# option to download pre-compiled NCCL/Legion libraries
option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if available" ON)
option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON)
option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)
-# option for using Python
-set(FF_GASNET_CONDUITS aries udp mpi ibv ucx)
+# option for using network
+set(FF_GASNET_CONDUITS aries udp mpi ibv)
set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}")
set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS})
set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use")
-if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx")
- if("${FF_UCX_URL}" STREQUAL "")
- set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz")
- else()
- set(UCX_URL "${FF_UCX_URL}")
- endif()
-
- set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx)
- get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME)
- # message(STATUS "UCX_URL: ${UCX_URL}")
- # message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}")
- set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}")
- set(UCX_BUILD_NEEDED OFF)
- set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt)
- set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log)
-
- if(EXISTS ${UCX_CONFIG_FILE})
- file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG)
- # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}")
- if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}")
- # configs match - no build needed
- set(UCX_BUILD_NEEDED OFF)
- else()
- message(STATUS "UCX configuration has changed - rebuilding...")
- set(UCX_BUILD_NEEDED ON)
- endif()
- else()
- message(STATUS "Configuring and building UCX...")
- set(UCX_BUILD_NEEDED ON)
- endif()
-
- if(UCX_BUILD_NEEDED)
- if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}")
- message(STATUS "Downloading openucx/ucx from: ${UCX_URL}")
- file(
- DOWNLOAD
- "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}"
- SHOW_PROGRESS
- STATUS status
- LOG log
- )
-
- list(GET status 0 status_code)
- list(GET status 1 status_string)
-
- if(status_code EQUAL 0)
- message(STATUS "Downloading... done")
- else()
- message(FATAL_ERROR "error: downloading '${UCX_URL}' failed
- status_code: ${status_code}
- status_string: ${status_string}
- log:
- --- LOG BEGIN ---
- ${log}
- --- LOG END ---"
- )
- endif()
- else()
- message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists")
- endif()
-
- execute_process(COMMAND mkdir -p ${UCX_DIR})
- execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1)
- message(STATUS "Building UCX...")
- execute_process(
- COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install"
- RESULT_VARIABLE UCX_BUILD_STATUS
- OUTPUT_FILE ${UCX_BUILD_OUTPUT}
- ERROR_FILE ${UCX_BUILD_OUTPUT}
- )
-
- if(UCX_BUILD_STATUS)
- message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details")
- endif()
-
- # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings
- file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}")
- endif()
-
- if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx")
- set(ENV{UCX_HOME} "${UCX_DIR}/install")
- install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin)
- install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include)
- install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib)
- install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share)
- endif()
-
- if (FF_LEGION_NETWORKS STREQUAL "ucx")
- set(ucx_DIR ${UCX_DIR}/cmake)
- set(ENV{Legion_NETWORKS} "ucx")
- message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}")
- endif()
-else()
- message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
+message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}")
+if (FF_LEGION_NETWORKS STREQUAL "gasnet")
+ message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
endif()
set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel)
@@ -151,17 +100,18 @@ set_property(CACHE FF_GPU_BACKEND PROPERTY STRINGS ${FF_GPU_BACKENDS})
# option for cuda arch
set(FF_CUDA_ARCH "autodetect" CACHE STRING "Target CUDA Arch")
-if (FF_CUDA_ARCH STREQUAL "")
+if ((FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") AND FF_CUDA_ARCH STREQUAL "")
message(FATAL_ERROR "FF_CUDA_ARCH cannot be an empty string. Set it to `autodetect`, `all`, or pass one or multiple valid CUDA archs.")
endif()
+# option for hip arch
+set(FF_HIP_ARCH "all" CACHE STRING "Target HIP Arch")
+if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_HIP_ARCH STREQUAL "")
+ message(FATAL_ERROR "FF_HIP_ARCH cannot be an empty string. Set it to `all`, or pass one or multiple valid HIP archs.")
+endif()
# option for nccl
option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)
-if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON")
- message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.")
-endif()
-
# option for avx2
option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF)
@@ -170,6 +120,7 @@ set(FF_MAX_DIM "4" CACHE STRING "Maximum dimention of tensors")
# option for legion
option(FF_USE_EXTERNAL_LEGION "Use pre-installed Legion" OFF)
+set(LEGION_MAX_RETURN_SIZE "32768" CACHE STRING "Maximum Legion return size")
set(FLEXFLOW_EXT_LIBRARIES "")
set(FLEXFLOW_INCLUDE_DIRS "")
@@ -181,10 +132,10 @@ set(LD_FLAGS $ENV{LD_FLAGS})
# Set global FLAGS
list(APPEND CC_FLAGS
- -std=c++11)
-
+ -std=c++17)
list(APPEND NVCC_FLAGS
- -std=c++11)
+ -std=c++17)
+
add_compile_options(${CC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
@@ -209,359 +160,442 @@ if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.")
endif()
-# ZLIB
-include(zlib)
-
# CUDA
if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cuda)
endif()
+# HIP
+if (FF_GPU_BACKEND STREQUAL "hip_rocm" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+ enable_language(HIP)
+ include(hip)
+endif()
+
# CUDNN
if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cudnn)
endif()
-# NCCL
-if(FF_USE_NCCL)
- include(nccl)
- list(APPEND FF_CC_FLAGS
- -DFF_USE_NCCL)
- list(APPEND FF_NVCC_FLAGS
- -DFF_USE_NCCL)
-endif()
-
# Legion
include(legion)
-# json
-include(json)
-
-# variant
-include(variant)
-
-# optional
-include(optional)
-
-if (FF_GPU_BACKEND STREQUAL "cuda")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_CUDA)
- list(APPEND FF_NVCC_FLAGS
- -DFF_USE_CUDA)
-elseif (FF_GPU_BACKEND STREQUAL "hip_cuda")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_HIP_CUDA)
- list(APPEND FF_HIPCC_FLAGS
- -DFF_USE_HIP_CUDA)
-elseif (FF_GPU_BACKEND STREQUAL "hip_rocm")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_HIP_ROCM)
- list(APPEND FF_HIPCC_FLAGS
- -DFF_USE_HIP_ROCM)
-else()
-endif()
+# Do not build FlexFlow if BUILD_LEGION_ONLY is ON
+if(NOT BUILD_LEGION_ONLY)
+ # NCCL
+ if(FF_USE_NCCL)
+ if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
+ include(nccl)
+ endif()
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_NCCL)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_USE_NCCL)
+ endif()
-# Start build FlexFlow
-if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ # Inference tests
+ if(INFERENCE_TESTS)
list(APPEND FF_CC_FLAGS
- -DFF_DEBUG)
+ -DINFERENCE_TESTS)
list(APPEND FF_NVCC_FLAGS
- -DFF_DEBUG)
-endif()
+ -DINFERENCE_TESTS)
+ endif()
+
+ # json
+ include(json)
+
+ # variant
+ include(variant)
+
+ # optional
+ include(optional)
+
+ if (FF_GPU_BACKEND STREQUAL "cuda")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_CUDA)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_USE_CUDA)
+ elseif (FF_GPU_BACKEND STREQUAL "hip_cuda")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_HIP_CUDA)
+ list(APPEND FF_HIPCC_FLAGS
+ -DFF_USE_HIP_CUDA)
+ elseif (FF_GPU_BACKEND STREQUAL "hip_rocm")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_HIP_ROCM)
+ list(APPEND FF_HIPCC_FLAGS
+ -DFF_USE_HIP_ROCM)
+ else()
+ endif()
-message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
+ # Start build FlexFlow
+ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ list(APPEND FF_CC_FLAGS
+ -DFF_DEBUG)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_DEBUG)
+ endif()
-list(APPEND FF_CC_FLAGS
- -DMAX_TENSOR_DIM=${FF_MAX_DIM})
+ message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
+ message(STATUS "LEGION_MAX_RETURN_SIZE: ${LEGION_MAX_RETURN_SIZE}")
-if(FF_USE_AVX2)
list(APPEND FF_CC_FLAGS
- -DFF_USE_AVX2
- -mavx2)
-endif()
-
-list(APPEND FF_NVCC_FLAGS
- -Wno-deprecated-gpu-targets
- -DMAX_TENSOR_DIM=${FF_MAX_DIM})
-
-list(APPEND FF_LD_FLAGS
- -lrt
- -ldl
- -rdynamic)
+ -DMAX_TENSOR_DIM=${FF_MAX_DIM}
+ -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE})
-# Set FF FLAGS
-add_compile_options(${FF_CC_FLAGS})
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG)
-link_libraries(${FF_LD_FLAGS})
+ if(FF_USE_AVX2)
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_AVX2
+ -mavx2)
+ endif()
-list(APPEND FLEXFLOW_INCLUDE_DIRS
- ${FLEXFLOW_ROOT}/include
- ${FLEXFLOW_ROOT})
+ list(APPEND FF_NVCC_FLAGS
+ -Wno-deprecated-gpu-targets
+ -DMAX_TENSOR_DIM=${FF_MAX_DIM}
+ -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE})
+
+ list(APPEND FF_LD_FLAGS
+ -lrt
+ -ldl
+ -rdynamic
+ -lstdc++fs)
+
+ # Set FF FLAGS
+ add_compile_options(${FF_CC_FLAGS})
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG)
+ link_libraries(${FF_LD_FLAGS})
+
+ list(APPEND FLEXFLOW_INCLUDE_DIRS
+ ${FLEXFLOW_ROOT}/include
+ ${FLEXFLOW_ROOT})
+
+ file(GLOB_RECURSE FLEXFLOW_HDR
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/include/*.h)
+
+ #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)
-file(GLOB_RECURSE FLEXFLOW_HDR
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/include/*.h)
+ file(GLOB_RECURSE FLEXFLOW_SRC
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/src/*.cc)
+
+ list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
+ #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)
-file(GLOB_RECURSE FLEXFLOW_SRC
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cc)
-list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
+ set(FLEXFLOW_CPP_DRV_SRC
+ ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
-set(FLEXFLOW_CPP_DRV_SRC
- ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
+ add_library(substitution_loader SHARED
+ ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc)
+ target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS})
+ target_link_libraries(substitution_loader nlohmann_json::nlohmann_json)
-add_library(substitution_loader SHARED
- ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc)
-target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS})
-target_link_libraries(substitution_loader nlohmann_json::nlohmann_json)
+ #message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}")
-#message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}")
+ # compile flexflow lib
+ if (FF_GPU_BACKEND STREQUAL "cuda")
+ file(GLOB_RECURSE FLEXFLOW_GPU_SRC
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/src/*.cu)
-# compile flexflow lib
-if (FF_GPU_BACKEND STREQUAL "cuda")
- file(GLOB_RECURSE FLEXFLOW_GPU_SRC
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cu)
+ add_compile_definitions(FF_USE_CUDA)
- add_compile_definitions(FF_USE_CUDA)
+ if(BUILD_SHARED_LIBS)
+ cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
+ else()
+ cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
+ endif()
+ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
+ file(GLOB_RECURSE FLEXFLOW_GPU_SRC
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/src/*.cpp)
+
+ set_source_files_properties(${FLEXFLOW_GPU_SRC} PROPERTIES LANGUAGE HIP)
+ set_source_files_properties(${FLEXFLOW_SRC} PROPERTIES LANGUAGE HIP)
+
+ if(BUILD_SHARED_LIBS)
+ add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
+ else()
+ add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
+ endif()
- if(BUILD_SHARED_LIBS)
- cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
+ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})
+
+ find_package(hip REQUIRED)
+
+ if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+ # The targets defined by the hip cmake config only target amd devices.
+ # For targeting nvidia devices, we'll make our own interface target,
+ # hip_device_nvidia, that includes the rocm and hip headers.
+ add_library(hip_device_nvidia INTERFACE)
+
+ if (NOT FF_CUDA_ARCH STREQUAL "")
+ target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
+ endif()
+
+ target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
+ target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
+
+ add_compile_definitions(FF_USE_HIP_CUDA)
+
+ # Linking cuda:
+ # We do not explicitly link cuda. hipcc when targeting nvidia will
+ # use nvcc under the hood. nvcc when used for linking will handle
+ # linking cuda dependencies
+ target_link_libraries(flexflow hip_device_nvidia)
+ elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+ find_package(hipblas REQUIRED)
+ find_package(miopen REQUIRED)
+ if(FF_USE_NCCL)
+ find_package(rccl REQUIRED)
+ endif()
+ # find_package(rocrand REQUIRED)
+ find_library(HIP_RAND_LIBRARY hiprand REQUIRED)
+
+ add_compile_definitions(FF_USE_HIP_ROCM)
+
+ if (FF_HIP_ARCH STREQUAL "")
+ message(FATAL_ERROR "FF_HIP_ARCH is undefined")
+ endif()
+ set_property(TARGET flexflow PROPERTY HIP_ARCHITECTURES "${HIP_ARCH_LIST}")
+
+ message(STATUS "FF_GPU_BACKEND: ${FF_GPU_BACKEND}")
+ message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}")
+ message(STATUS "HIP_ARCH_LIST: ${HIP_ARCH_LIST}")
+ get_property(CHECK_HIP_ARCHS TARGET flexflow PROPERTY HIP_ARCHITECTURES)
+ message(STATUS "CHECK_HIP_ARCHS: ${CHECK_HIP_ARCHS}")
+ message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
+
+ # The hip cmake config module defines three targets,
+ # hip::amdhip64, hip::host, and hip::device.
+ #
+ # hip::host and hip::device are interface targets. hip::amdhip64 is an
+ # imported target for libamdhip.
+ #
+ # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
+ # and hip::device links to hip::host. Link to hip::host to just use hip without
+ # compiling any GPU code. Link to hip::device to compile the GPU device code.
+ #
+ # Docs (outdated):
+ # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
+ target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
+ if(FF_USE_NCCL)
+ target_link_libraries(flexflow rccl)
+ endif()
+ endif()
else()
- cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
+ message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
endif()
-elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
- file(GLOB_RECURSE FLEXFLOW_GPU_SRC
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cpp)
- if(BUILD_SHARED_LIBS)
- add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
- else()
- add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
+ if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
+ add_dependencies(flexflow ${NCCL_NAME})
endif()
- list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})
-
- find_package(hip REQUIRED)
-
- if (FF_GPU_BACKEND STREQUAL "hip_cuda")
- # The targets defined by the hip cmake config only target amd devices.
- # For targeting nvidia devices, we'll make our own interface target,
- # hip_device_nvidia, that includes the rocm and hip headers.
- add_library(hip_device_nvidia INTERFACE)
-
- if (NOT FF_CUDA_ARCH STREQUAL "")
- target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
- endif()
-
- target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
- target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
-
- add_compile_definitions(FF_USE_HIP_CUDA)
-
- # Linking cuda:
- # We do not explicitly link cuda. hipcc when targeting nvidia will
- # use nvcc under the hood. nvcc when used for linking will handle
- # linking cuda dependencies
- target_link_libraries(flexflow hip_device_nvidia)
- elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
- find_package(hipblas REQUIRED)
- find_package(miopen REQUIRED)
- # find_package(rocrand REQUIRED)
- find_library(HIP_RAND_LIBRARY hiprand REQUIRED)
-
- add_compile_definitions(FF_USE_HIP_ROCM)
-
- # The hip cmake config module defines three targets,
- # hip::amdhip64, hip::host, and hip::device.
- #
- # hip::host and hip::device are interface targets. hip::amdhip64 is an
- # imported target for libamdhip.
- #
- # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
- # and hip::device links to hip::host. Link to hip::host to just use hip without
- # compiling any GPU code. Link to hip::device to compile the GPU device code.
- #
- # Docs (outdated):
- # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
- target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
+ target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ # LEGION_URL is defined if we found a precompiled Legion library to download
+ if(LEGION_URL)
+ # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
+ # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
+ target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
+ add_dependencies(flexflow ${LEGION_NAME})
+ else()
+ # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
+ # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
+ # to link FlexFlow to ${LEGION_LIBRARY}
+ target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
endif()
-else()
- message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
-endif()
-
-if(FF_USE_NCCL)
- add_dependencies(flexflow ${NCCL_NAME})
-endif()
-
-target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
-# LEGION_URL is defined if we found a precompiled Legion library to download
-if(LEGION_URL)
- # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
- # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
- target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
- add_dependencies(flexflow ${LEGION_NAME})
-else()
- # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
- # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
- # to link FlexFlow to ${LEGION_LIBRARY}
- target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
-endif()
-#library api version, bump from time to time
-set(SOVERSION 1)
-
-set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}")
-set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION})
-if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
- set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN")
- set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN")
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
- set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path")
- set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path")
-endif()
+ #library api version, bump from time to time
+ set(SOVERSION 1)
+
+ set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}")
+ set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION})
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN")
+ set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN")
+ elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path")
+ set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path")
+ endif()
-# python related
-if (FF_USE_PYTHON)
- # create flexflow_cffi_header.py
- add_custom_command(TARGET flexflow
- PRE_BUILD
- COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Creating flexflow_cffi_header.py..."
- )
- if (NOT FF_BUILD_FROM_PYPI)
- # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
- add_custom_command(TARGET flexflow
- POST_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
- )
- # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
+ # python related
+ if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
+ # create flexflow_cffi_header.py
add_custom_command(TARGET flexflow
PRE_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Creating flexflow_python interpreter..."
+ COMMENT "Creating flexflow_cffi_header.py..."
)
- install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin")
+ if (NOT FF_BUILD_FROM_PYPI)
+ # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
+ add_custom_command(TARGET flexflow
+ POST_BUILD
+ COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
+ )
+ # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
+ add_custom_command(TARGET flexflow
+ PRE_BUILD
+ COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMENT "Creating flexflow_python interpreter..."
+ )
+ install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin")
+ endif()
+ endif()
+
+ if (INFERENCE_TESTS)
+ target_link_libraries(flexflow "${TORCH_LIBRARIES}")
+ set_property(TARGET flexflow PROPERTY CXX_STANDARD 14)
endif()
-endif()
-# build binary
-option(FF_BUILD_RESNET "build resnet example" OFF)
-option(FF_BUILD_RESNEXT "build resnext example" OFF)
-option(FF_BUILD_ALEXNET "build alexnet example" OFF)
-option(FF_BUILD_DLRM "build DLRM example" OFF)
-option(FF_BUILD_XDL "build XDL example" OFF)
-option(FF_BUILD_INCEPTION "build inception example" OFF)
-option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF)
-option(FF_BUILD_TRANSFORMER "build transformer example" OFF)
-option(FF_BUILD_MOE "build mixture of experts example" OFF)
-option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF)
-option(FF_BUILD_SPLIT_TEST "build split test example" OFF)
-option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF)
-option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF)
-option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF)
-option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF)
-option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF)
-
-if(FF_BUILD_UNIT_TESTS)
- set(BUILD_GMOCK OFF)
- add_subdirectory(deps/googletest)
- enable_testing()
- add_subdirectory(tests/unit)
-endif()
+ # build binary
+  option(FF_BUILD_TOKENIZER "build tokenizers-cpp for LLM serving" OFF)
+ option(FF_BUILD_RESNET "build resnet example" OFF)
+ option(FF_BUILD_RESNEXT "build resnext example" OFF)
+ option(FF_BUILD_ALEXNET "build alexnet example" OFF)
+ option(FF_BUILD_DLRM "build DLRM example" OFF)
+ option(FF_BUILD_XDL "build XDL example" OFF)
+ option(FF_BUILD_INCEPTION "build inception example" OFF)
+ option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF)
+ option(FF_BUILD_TRANSFORMER "build transformer example" OFF)
+ option(FF_BUILD_MOE "build mixture of experts example" OFF)
+ option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF)
+ option(FF_BUILD_SPLIT_TEST "build split test example" OFF)
+ option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF)
+ option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF)
+ option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF)
+ option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF)
+ option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF)
+ option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF)
+ option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF)
+
+ if(FF_BUILD_UNIT_TESTS)
+ set(BUILD_GMOCK OFF)
+ add_subdirectory(deps/googletest)
+ enable_testing()
+ add_subdirectory(tests/unit)
+ endif()
-if(FF_BUILD_SUBSTITUTION_TOOL)
- add_subdirectory(tools/protobuf_to_json)
-endif()
+ if(FF_BUILD_SUBSTITUTION_TOOL)
+ add_subdirectory(tools/protobuf_to_json)
+ endif()
-if(FF_BUILD_VISUALIZATION_TOOL)
- add_subdirectory(tools/substitutions_to_dot)
-endif()
+ if(FF_BUILD_VISUALIZATION_TOOL)
+ add_subdirectory(tools/substitutions_to_dot)
+ endif()
-if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/ResNet)
-endif()
+ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER)
+ # Ensure Rust is installed
+ execute_process(COMMAND rustc --version
+ RESULT_VARIABLE RUST_COMMAND_RESULT
+ OUTPUT_VARIABLE RUSTC_OUTPUT
+ ERROR_QUIET)
+ if(NOT RUST_COMMAND_RESULT EQUAL 0)
+ message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.")
+ endif()
+ # Ensure Cargo is installed
+ execute_process(COMMAND cargo --version
+ RESULT_VARIABLE CARGO_RESULT
+ OUTPUT_QUIET ERROR_QUIET)
+ if(NOT CARGO_RESULT EQUAL 0)
+ message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.")
+ endif()
+ set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON)
+ add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL)
+ target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include)
+ target_link_libraries(flexflow tokenizers_cpp)
+ endif()
+ if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/ResNet)
+ endif()
-if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/resnext50)
-endif()
+ if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/resnext50)
+ endif()
-if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/AlexNet)
-endif()
+ if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/AlexNet)
+ endif()
-if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/MLP_Unify)
-endif()
+ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/MLP_Unify)
+ endif()
-if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/split_test)
-endif()
+ if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/split_test)
+ endif()
-if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/split_test_2)
-endif()
+ if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/split_test_2)
+ endif()
-if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/InceptionV3)
-endif()
+ if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/InceptionV3)
+ endif()
-#TODO: Once functional add to BUILD_ALL_EXAMPLES
-if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/candle_uno)
-endif()
+ #TODO: Once functional add to BUILD_ALL_EXAMPLES
+ if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/candle_uno)
+ endif()
-if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/DLRM)
+ if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/DLRM)
- #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc)
- #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc)
+ #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
- #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc)
- #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
-endif()
+ #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc)
+ #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ endif()
-if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/XDL)
-endif()
+ if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/XDL)
+ endif()
-if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/Transformer)
-endif()
+ if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/Transformer)
+ endif()
-if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/mixture_of_experts)
-endif()
+ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/mixture_of_experts)
+ endif()
-# installation
-set(INCLUDE_DEST "include")
-set(LIB_DEST "lib")
-install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
-install(TARGETS flexflow DESTINATION ${LIB_DEST})
-# install python
-if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
- if (NOT FF_BUILD_FROM_PYPI)
- install(
- DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
- DESTINATION ${PY_DEST}/flexflow
- FILES_MATCHING
- PATTERN "*.py")
- else()
- # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually.
- install(
- PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py
- DESTINATION ${PY_DEST}/flexflow/core
- )
- # Use setup.py script to re-install the Python bindings library with the right library paths.
- # Need to put the instructions in a subfolder because of issue below:
- # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake
- add_subdirectory(cmake/pip_install)
+ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(inference/spec_infer)
+ add_subdirectory(inference/incr_decoding)
+ add_subdirectory(inference/peft)
endif()
-endif()
+
+
+ # installation
+ set(INCLUDE_DEST "include")
+ set(LIB_DEST "lib")
+ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
+ install(TARGETS flexflow DESTINATION ${LIB_DEST})
+ # install python
+ if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if (NOT FF_BUILD_FROM_PYPI)
+ install(
+ DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
+ DESTINATION ${PY_DEST}/flexflow
+ FILES_MATCHING
+ PATTERN "*.py")
+ else()
+ # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually.
+ install(
+ PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py
+ DESTINATION ${PY_DEST}/flexflow/core
+ )
+ # Use setup.py script to re-install the Python bindings library with the right library paths.
+ # Need to put the instructions in a subfolder because of issue below:
+ # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake
+ add_subdirectory(cmake/pip_install)
+ endif()
+ endif()
+endif() # if(NOT BUILD_LEGION_ONLY)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e607fddb1a..c3c0b5173f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -119,7 +119,26 @@ After adding the DNN layers, the next step before compiling the model for traini
#### Model compilation
-TODO
+Model compilation consists of the following steps:
+
+1. We initialize an operator for each layer in the model, via the function `create_operators_from_layers()`. Layers work with `Tensor` inputs/weights/outputs and are created directly by the user when writing a FlexFlow program. Operators work with `ParallelTensor` objects and are responsible for running computations by launching kernels on GPUs.
+2. Launch the graph optimize task (`GRAPH_OPTIMIZE_TASK_ID`), implemented by `PCG::Graph::graph_optimize_task`, which returns a `PCG::GraphOptimalViewSerialized`
+ 1. call `deserialize_graph_optimal_view(...)` to get `PCG::Graph *best_graph` and `std::unordered_map optimal_views` from deserialized `PCG::GraphOptimalViewSerialized`
+ 2. `convert_graph_to_operators()`
+ 3. print the dot of the best graph obtained
+    4. map the inputs and the weights to parallel tensors (the for loop that performs this mapping still needs to be understood better)
+3. Initialize performance metrics via `FFModel::update_metrics_task`
+4. Perform inplace optimizations (if enabled)
+5. Loop through the operators to perform the following steps (these still need to be understood better):
+ 1. `parameters.push_back(op->weights[i]);` for each weight in each operator
+ 2. `op->map_output_tensors(*this);`
+ 3. `((ParallelOp *)op)->create_input_partition(*this);` if the operator is a parallel operator
+6. Check correctness of the operator's input and output tensors' settings
+7. Perform fusion optimizations, if enabled
+8. Print all operators and their input and output regions
+9. Create the tensor for the label
+10. Initialize the optimizer
+11. In training mode, if NCCL is enabled, initialize all the communicators and other objects
## Continuous Integration
@@ -281,6 +300,10 @@ We want to make contributing to this project as easy and transparent as possible
### Formatting
We use `clang-format` to format our C++ code. If you make changes to the code and the Clang format CI test is failing, you can lint your code by running: `./scripts/format.sh` from the main folder of this repo.
+### Documenting the code
+We follow the Python docstring conventions for documenting the Python code. We document the C++ code using comments in any of the conventions supported by Doxygen ([see here](https://doxygen.nl/manual/docblocks.html)).
+
+
### Pull Requests
We actively welcome your pull requests.
diff --git a/FlexFlow.mk b/FlexFlow.mk
index b434045893..14f32a7639 100644
--- a/FlexFlow.mk
+++ b/FlexFlow.mk
@@ -59,7 +59,8 @@ GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\
$(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\
$(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc')\
$(shell find $(FF_HOME)/src/dataloader/ -name '*.cc')\
- $(shell find $(FF_HOME)/src/c/ -name '*.cc')
+ $(shell find $(FF_HOME)/src/c/ -name '*.cc')\
+ $(shell find $(FF_HOME)/inference/ -name 'file_loader.cc')
GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC))
FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\
@@ -94,15 +95,17 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
endif
-INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include
+INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
GASNET_FLAGS +=
# For Point and Rect typedefs
-CC_FLAGS += -std=c++11
-NVCC_FLAGS += -std=c++11
-HIPCC_FLAGS += -std=c++11
+CC_FLAGS += -std=c++17
+NVCC_FLAGS += -std=c++17
+HIPCC_FLAGS += -std=c++17
+
+LD_FLAGS += -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers -ltokenizers_cpp -ltokenizers_c -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers/sentencepiece/src -lsentencepiece
ifeq ($(strip $(FF_USE_NCCL)), 1)
INC_FLAGS += -I$(MPI_HOME)/include -I$(NCCL_HOME)/include
diff --git a/INSTALL.md b/INSTALL.md
index d2e3c1d2f6..1734319540 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,4 +1,4 @@
-# Installing FlexFlow
+# Building from source
To build and install FlexFlow, follow the instructions below.
## 1. Download the source code
@@ -30,7 +30,7 @@ If you are planning to build the Python interface, you will need to install seve
The `conda` environment can be created and activated as:
```
-conda env create -f conda/environment.yml
+conda env create -f conda/flexflow.yml
conda activate flexflow
```
@@ -42,7 +42,7 @@ You can configure a FlexFlow build by running the `config/config.linux` file in
3. `FF_CUDA_ARCH` is used to set the architecture of targeted GPUs, for example, the value can be 60 if the GPU architecture is Pascal. To build for more than one architecture, pass a list of comma separated values (e.g. `FF_CUDA_ARCH=70,75`). To compile FlexFlow for all GPU architectures that are detected on the machine, pass `FF_CUDA_ARCH=autodetect` (this is the default value, so you can also leave `FF_CUDA_ARCH` unset. If you want to build for all GPU architectures compatible with FlexFlow, pass `FF_CUDA_ARCH=all`. **If your machine does not have any GPU, you have to set FF_CUDA_ARCH to at least one valid architecture code (or `all`)**, since the compiler won't be able to detect the architecture(s) automatically.
4. `FF_USE_PYTHON` controls whether to build the FlexFlow Python interface.
5. `FF_USE_NCCL` controls whether to build FlexFlow with NCCL support. By default, it is set to ON.
-6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in [MULTI-NODE.md](MULTI-NODE.md) and set the corresponding parameters as follows:
+6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in the [Multinode tutorial](https://flexflow.readthedocs.io/en/latest/multinode.html) and set the corresponding parameters as follows:
* To build FlexFlow with GASNet, set `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT` as a specific conduit (e.g. `ibv`, `mpi`, `udp`, `ucx`) in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX.
* To build FlexFlow with native UCX, set `FF_LEGION_NETWORKS=ucx` in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX.
8. `FF_BUILD_EXAMPLES` controls whether to build all C++ example programs.
@@ -85,10 +85,11 @@ export FF_HOME=/path/to/FlexFlow
### Run FlexFlow Python examples
The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python). The native, Keras integration and PyTorch integration examples are listed in `native`, `keras` and `pytorch` respectively.
-To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags:
+To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the required environment flags by running the following command (edit the path if your build folder is not named `build`):
-* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/deps/legion/bindings/python:${PYTHONPATH}"`
-* `export LD_LIBRARY_PATH="${FF_HOME}/build:${FF_HOME}/build/deps/legion/lib:${LD_LIBRARY_PATH}"`
+```
+source ./build/set_python_envs.sh
+```
**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:**
@@ -96,7 +97,7 @@ To run the Python examples, you have two options: you can use the `flexflow_pyth
cd "$FF_HOME"
 ./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize <size of gpu buffer> -ll:zsize <size of zero copy memory>
```
-A script to run all the Python examples is available at `tests/multi_gpu_tests.sh`
+A script to run all the Python examples is available at `tests/training_tests.sh`
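+
+For example, you can invoke it with the same arguments used by this repository's CI workflows (a sketch; adjust the arguments for your own machine):
+```
+cd "$FF_HOME"
+./tests/training_tests.sh 2 2
+```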
### Run FlexFlow C++ examples
diff --git a/MULTI-NODE.md b/MULTI-NODE.md
index a8fd2fb705..28f2eab8ed 100644
--- a/MULTI-NODE.md
+++ b/MULTI-NODE.md
@@ -17,15 +17,33 @@ Source: Custom (use the security group ID)
You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network.
-## 2. Configure and build FlexFlow
+## 2. Configure and build UCX
-Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**.
+Find the latest UCX source release at https://github.com/openucx/ucx/releases. As of this writing, the latest release was UCX 1.15.0, available at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. Extract it, switch to the directory containing the UCX source code, and run:
+
+```
+CUDA_PATH=/usr/local/cuda
+PREFIX=$PWD/install
+./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH"
+make -j install
+echo "$PREFIX"
+```
+
+Adjust `CUDA_PATH` if your CUDA installation is not at `/usr/local/cuda`; if you don't know the path, try `which nvcc`. Take note of the UCX installation path, echoed by the last command.
+
+## 3. Configure and build FlexFlow
+
+Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Alternatively, you can use NFS to mount the home directory of each instance so that only a single build is necessary.
You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance.
-For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional.
+For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured:
+* Set `FF_LEGION_NETWORKS=ucx`
+* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx)
-## 3. Configure MPI
+Other configuration options are optional.
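+
+For example (a minimal sketch with placeholder paths; adapt them to your own setup), the two UCX-related settings above can be exported before running `config/config.linux` from the build folder:
+
+```
+export FF_LEGION_NETWORKS=ucx
+export UCX_DIR=/path/to/ucx/install   # the $PREFIX echoed at the end of the UCX build step
+mkdir -p "$FF_HOME/build" && cd "$FF_HOME/build"
+../config/config.linux
+make -j
+```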
+
+## 4. Configure MPI
MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them.
@@ -64,8 +82,9 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su
5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)).
-## 4. Test FlexFlow
+## 5. Test FlexFlow
+
+Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set the environment variables.
-Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables.
+A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. Run the script to test FlexFlow on MNIST MLP training. You can adapt the script to run any other program; make sure to change the `FLEXFLOW_DIR` and `UCX_DIR` variables in it to the appropriate paths.
-A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html).
\ No newline at end of file
diff --git a/README.md b/README.md
index 9ad900fb3c..95790a90e5 100644
--- a/README.md
+++ b/README.md
@@ -1,72 +1,54 @@
-# FlexFlow
-![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
+# FlexFlow: Low-Latency, High-Performance Training and Serving
+![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
-FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).
-## Install FlexFlow
-To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon).
+---
-## PyTorch Support
-Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
-```python
-import torch
-import flexflow.torch.fx as fx
+## News 🔥:
-model = MyPyTorchModule()
-fx.torch_to_flexflow(model, "mymodel.ff")
-```
+* [09/02/2023] Added AMD GPU support and released Docker images for ROCm 5.3–5.6
+* [08/16/2023] Added StarCoder model support
+* [08/14/2023] Released Docker images for different CUDA versions
+
+## Install FlexFlow
-Second, a FlexFlow program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine.
-```python
-from flexflow.pytorch.model import PyTorchModel
+### Requirements
+* OS: Linux
+* GPU backend: Hip-ROCm or CUDA
+ * CUDA version: 10.2 – 12.0
+ * NVIDIA compute capability: 6.0 or higher
+* Python: 3.6 or higher
+* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt)
-def top_level_task():
- torch_model = PyTorchModel("mymodel.ff")
- output_tensor = torch_model.apply(ffmodel, input_tensor)
- ## Model compilation
- ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
- ## Model training
- (x_train, y_train) = cifar10.load_data()
- ffmodel.fit(x_train, y_train, epochs=30)
+### Install with pip
+You can install FlexFlow using pip:
+
+```bash
+pip install flexflow
```
-**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch).
+### Try it in Docker
+If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container:
+
+```bash
+docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
+```
-## TensorFlow Keras and ONNX Support
-FlexFlow prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models.
+To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.1`, `cuda-12.2`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
-## C++ Interface
-For users that prefer to program in C/C++. FlexFlow supports a C++ program inference that is equivalent to its Python APIs.
+### Build from source
-**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp).
+You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html).
-## Command-Line Flags
-In addition to setting runtime configurations in a FlexFlow Python/C++ program, the FlexFlow runtime also accepts command-line arguments for various runtime parameters:
+## Get Started!
-FlexFlow training flags:
-* `-e` or `--epochs`: number of total epochs to run (default: 1)
-* `-b` or `--batch-size`: global batch size in each iteration (default: 64)
-* `-p` or `--print-freq`: print frequency (default: 10)
-* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training.
+To get started, check out the quickstart guides below for the FlexFlow training and serving libraries.
-Legion runtime flags:
-* `-ll:gpu`: number of GPU processors to use on each node (default: 0)
-* `-ll:fsize`: size of device memory on each GPU (in MB)
-* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefecthing training images from disk.
-* `-ll:cpu`: number of data loading workers (default: 4)
-* `-ll:util`: number of utility threads to create per process (default: 1)
-* `-ll:bgwork`: number of background worker threads to create per process (default: 1)
+* [FlexFlow Train](./TRAIN.md)
+* [FlexFlow Serve](./SERVE.md)
-Performance auto-tuning flags:
-* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0)
-* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05)
-* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None)
-* `--import-strategy` or `--import`: path to import a previous saved strategy (default: None)
-* `--enable-parameter-parallel`: allow FlexFlow to explore parameter parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
-* `--enable-attribute-parallel`: allow FlexFlow to explore attribute parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
-For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search).
## Contributing
@@ -75,6 +57,14 @@ Please let us know if you encounter any bugs or have any suggestions by [submitt
We welcome all contributions to FlexFlow from bug fixes to new features and extensions.
## Citations
+
+**FlexFlow Serve:**
+
+* Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, Xiaoxiang Shi, Chunan Shi, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, Zhihao Jia. [SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). In arXiv, May 2023.
+
+
+**FlexFlow Train:**
+
* Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022.
* Zhihao Jia, Matei Zaharia, and Alex Aiken. [Beyond Data and Model Parallelism for Deep Neural Networks](https://cs.stanford.edu/~zhihao/papers/sysml19a.pdf). In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), April 2019.
@@ -86,3 +76,4 @@ FlexFlow is developed and maintained by teams at CMU, Facebook, Los Alamos Natio
## License
FlexFlow uses Apache License 2.0.
+
diff --git a/SERVE.md b/SERVE.md
new file mode 100644
index 0000000000..9472d50a62
--- /dev/null
+++ b/SERVE.md
@@ -0,0 +1,275 @@
+# FlexFlow Serve: Low-Latency, High-Performance LLM Serving
+
+
+## What is FlexFlow Serve
+
+The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply.
+FlexFlow Serve is an open-source compiler and distributed system for
+__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms
+existing systems by 1.3-2.0x for single-node, multi-GPU inference and by
+1.4-2.4x for multi-node, multi-GPU inference.
+
+
+
+
+
+
+## Quickstart
+The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively.
+We need to make sure the aggregated GPU memory and zero-copy memory are **both** sufficient to store LLM parameters in non-offloading serving. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving.
+```python
+import flexflow.serve as ff
+
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+```
+Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms).
+```python
+# Specify the LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Specify a list of SSMs (just one in this case)
+ssms = []
+ssm = ff.SSM("JackFram/llama-68m")
+ssms.append(ssm)
+```
+Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs.
+```python
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=False, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the SSMs for inference and load the weights into memory
+for ssm in ssms:
+ ssm.compile(generation_config)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config, ssms=ssms)
+```
+Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult` objects, each of which includes the output tokens and text.
+```python
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+```
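+
+For instance, mirroring the chatbot example later in this document, the generated text for a single prompt can be printed as follows:
+
+```python
+# Print the generated text (GenerationResult also carries the output tokens).
+print(result.output_text.decode("utf-8"))
+```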
+
+### Incremental decoding
+
+
+
+
+```python
+
+import flexflow.serve as ff
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+
+# Create the FlexFlow LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=True, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config)
+
+# Generation begins!
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+
+```
+
+
+
+### C++ interface
+If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source, and follow the instructions below.
+
+
+
+
+#### Downloading models
+
+Before running FlexFlow Serve, you should manually download the LLM and SSM(s) of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script will download all of a model's assets (weights, configs, tokenizer files, etc.) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can specify it via the `--cache-folder` parameter.
+
+```bash
+python3 ./inference/utils/download_hf_model.py ...
+```
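+
+For example, a hypothetical invocation that downloads an LLM and an SSM into a custom folder might look like the following; we assume here that the script accepts the HuggingFace model IDs as positional arguments:
+
+```bash
+# Hypothetical example: model IDs as positional arguments, custom cache folder.
+python3 ./inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf JackFram/llama-68m --cache-folder /path/to/model/assets
+```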
+
+#### Running the C++ examples
+A C++ example is available at [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve:
+
+* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0)
+* `-ll:fsize`: size of device memory on each GPU in MB
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
+* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
+* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+* `-cache-folder`: the folder from which to load the downloaded model assets (default: `~/.cache/flexflow`)
+* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
+* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects the prompts in JSON format.
+* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency
+
+For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use a collectively boost-tuned LLaMA-68M model for speculative inference.
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+```
+
+
+## Speculative Inference
+A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative
+inference, which combines various collectively boost-tuned small speculative
+models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The correctness
+of all candidate token sequences represented by a token tree is verified against the
+LLM’s output in parallel using a novel tree-based parallel decoding mechanism.
+FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder,
+which largely reduces the end-to-end inference latency and computational requirement
+for serving generative LLMs while provably preserving model quality.
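+
+As a conceptual illustration only (FlexFlow Serve verifies the entire token tree in a single parallel pass rather than sequentially), the sketch below shows how one root-to-leaf path of a speculated token tree can be checked against the LLM's own next-token predictions, accepting tokens up to the first disagreement:
+
+```python
+# Conceptual sketch only: sequential verification of one speculated path.
+# `llm_next_token(context)` stands in for a single LLM decoding step.
+def verify_path(llm_next_token, prompt_tokens, speculated_path):
+    accepted = []
+    context = list(prompt_tokens)
+    for token in speculated_path:
+        if llm_next_token(context) != token:
+            break  # first mismatch: discard the rest of this speculated path
+        accepted.append(token)
+        context.append(token)
+    return accepted
+```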
+
+
+
+
+
+### Supported LLMs and SSMs
+
+FlexFlow Serve currently supports all HuggingFace models with the following architectures:
+* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...)
+* `OPTForCausalLM` (models from the OPT family)
+* `RWForCausalLM` (models from the Falcon family)
+* `GPTBigCodeForCausalLM` (models from the Starcoder family)
+
+Below is a list of models that we have explicitly tested and for which an SSM may be available:
+
+| Model | Model id on HuggingFace | Boost-tuned SSMs |
+| :---- | :---- | :---- |
+| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| Falcon-7B | tiiuae/falcon-7b | |
+| Falcon-40B | tiiuae/falcon-40b | |
+| StarCoder-15.5B | bigcode/starcoder | |
+
+
+### CPU Offloading
+FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading keeps selected tensors in CPU memory and copies them to the GPU only when they are needed for computation. Currently, we selectively offload only the largest weight tensors (the weight tensors of the Linear and Attention layers). In addition, since the small speculative models occupy considerably less space and do not pose a bottleneck for GPU memory, while offloading adds extra runtime and computational cost, we only offload the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags.
+
+### Quantization
+FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. [TODO: update instructions for quantization]
+
+### Prompt Datasets
+We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
+
+
+
+
+## Python Interface Features and Interaction Methods
+
+FlexFlow Serve provides a comprehensive Python interface for serving with low latency and high performance. This interface facilitates deploying and interacting with the serving platform for a variety of applications, from chatbots and prompt templates to retrieval-augmented generation and API services.
+
+### Chatbot with Gradio
+
+The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface.
+
+#### Implementation Steps
+1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM.
+```python
+import gradio as gr
+import flexflow.serve as ff
+
+ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+```
+2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction.
+```python
+def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+```
+3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface.
+```python
+iface = gr.ChatInterface(fn=generate_response)
+iface.launch()
+```
+4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete.
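+
+Putting the steps together, a minimal end-to-end sketch might look like the following. It reuses the compilation flow from the Quickstart; the values passed to `ff.init` are illustrative, and the `llm.stop_server()` call for step 4 is an assumption mirroring the `llm.start_server()` call used in the FastAPI example below.
+
+```python
+import gradio as gr
+import flexflow.serve as ff
+
+# Initialize FlexFlow Serve and compile the LLM (illustrative values, as in the Quickstart)
+ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000,
+        tensor_parallelism_degree=2, pipeline_parallelism_degree=1)
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+llm.compile(ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1))
+llm.start_server()
+
+def generate_response(user_input, history=None):
+    # Gradio's ChatInterface also passes the chat history; it is ignored here.
+    result = llm.generate(user_input)
+    return result.output_text.decode("utf-8")
+
+gr.ChatInterface(fn=generate_response).launch()
+llm.stop_server()  # assumed shutdown call (step 4)
+```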
+
+
+
+### LangChain Use Cases
+FlexFlow Serve supports LangChain use cases, including dynamic prompt-template handling and Retrieval Augmented Generation (RAG), enabling the customization of model responses based on structured input templates and retrieved context.
+
+#### Implementation Steps
+1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations.
+2. **LLM Setup**: Compile and load the LLM for text generation.
+3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses.
+4. **Response Generation**: Use the LLM with the prompt template to generate responses.
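+
+As a minimal sketch of steps 3 and 4 (assuming `llm` is an `ff.LLM` that has already been compiled as in the Quickstart; the prompt-template handling below uses plain LangChain rather than a FlexFlow-specific API):
+
+```python
+from langchain.prompts import PromptTemplate
+
+# Build a structured prompt template and fill it in before calling the LLM.
+template = PromptTemplate(
+    input_variables=["topic"],
+    template="Write three concise travel tips about {topic}:\n",
+)
+prompt = template.format(topic="Tokyo")
+result = llm.generate(prompt)
+print(result.output_text.decode("utf-8"))
+```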
+
+
+### Python FastAPI Entrypoint
+FlexFlow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API for generating responses from models.
+
+```python
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+app = FastAPI()
+
+# Request body schema for the /generate/ endpoint
+class PromptRequest(BaseModel):
+    prompt: str
+
+# `llm` and `generation_config` are assumed to be created at module level,
+# e.g. as in the Quickstart above.
+
+@app.on_event("startup")
+async def startup_event():
+    global llm
+    # Initialize and compile the LLM model
+ llm.compile(
+ generation_config,
+ # ... other params as needed
+ )
+ llm.start_server()
+
+@app.post("/generate/")
+async def generate(prompt_request: PromptRequest):
+ # ... exception handling
+ full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
+ # ... split prompt and response text for returning results
+ return {"prompt": prompt_request.prompt, "response": full_output}
+```
+
+
+
+
+## TODOs
+
+FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions.
+
+* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
+
+## Acknowledgements
+This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
+
+``` bibtex
+@misc{miao2023specinfer,
+ title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification},
+ author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia},
+ year={2023},
+ eprint={2305.09781},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+
+## License
+FlexFlow uses Apache License 2.0.
diff --git a/TRAIN.md b/TRAIN.md
new file mode 100644
index 0000000000..1595274a4c
--- /dev/null
+++ b/TRAIN.md
@@ -0,0 +1,65 @@
+# FlexFlow Train: Distributed DNN Training with Flexible Parallelization Strategies
+FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow Train provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow Train only requires [a few lines of changes to the program](https://flexflow.ai/keras).
+
+
+## PyTorch Support
+Users can use FlexFlow Train to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
+```python
+import torch
+import flexflow.torch.fx as fx
+
+model = MyPyTorchModule()
+fx.torch_to_flexflow(model, "mymodel.ff")
+```
+
+Second, a FlexFlow Train program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine.
+
+```python
+from flexflow.pytorch.model import PyTorchModel
+
+def top_level_task():
+ torch_model = PyTorchModel("mymodel.ff")
+ output_tensor = torch_model.apply(ffmodel, input_tensor)
+ ## Model compilation
+ ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+ ## Model training
+ (x_train, y_train) = cifar10.load_data()
+ ffmodel.fit(x_train, y_train, epochs=30)
+```
+
+**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch).
+
+## TensorFlow Keras and ONNX Support
+FlexFlow Train prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models.
+
+## C++ Interface
+For users who prefer to program in C/C++, FlexFlow Train supports a C++ programming interface that is equivalent to its Python APIs.
+
+**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp).
+
+
+## Command-Line Flags
+In addition to setting runtime configurations in a FlexFlow Train Python/C++ program, the FlexFlow Train runtime also accepts command-line arguments for various runtime parameters:
+
+FlexFlow training flags:
+* `-e` or `--epochs`: number of total epochs to run (default: 1)
+* `-b` or `--batch-size`: global batch size in each iteration (default: 64)
+* `-p` or `--print-freq`: print frequency (default: 10)
+* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training.
+
+Legion runtime flags:
+* `-ll:gpu`: number of GPU processors to use on each node (default: 0)
+* `-ll:fsize`: size of device memory on each GPU (in MB)
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefetching training images from disk.
+* `-ll:cpu`: number of data loading workers (default: 4)
+* `-ll:util`: number of utility threads to create per process (default: 1)
+* `-ll:bgwork`: number of background worker threads to create per process (default: 1)
+
+Performance auto-tuning flags:
+* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0)
+* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05)
+* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None)
+* `--import-strategy` or `--import`: path to import a previously saved strategy (default: None)
+* `--enable-parameter-parallel`: allow FlexFlow Train to explore parameter parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.)
+* `--enable-attribute-parallel`: allow FlexFlow Train to explore attribute parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.)
+For more details on the performance-tuning flags, see [performance autotuning](https://flexflow.ai/search).
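+
+For example, a hypothetical invocation that combines training and Legion runtime flags might look like the following (the launcher path, script path, and memory sizes are placeholders):
+
+```bash
+# Hypothetical example: train for 10 epochs with a global batch size of 64 on one GPU,
+# giving Legion 8 GB of device memory and 12 GB of zero-copy memory.
+./flexflow_python "$FF_HOME"/examples/python/your_training_script.py \
+    -ll:gpu 1 -ll:fsize 8000 -ll:zsize 12000 --epochs 10 -b 64
+```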
diff --git a/bootcamp_demo/ff_alexnet_cifar10.py b/bootcamp_demo/ff_alexnet_cifar10.py
deleted file mode 100644
index cb0b0e99ad..0000000000
--- a/bootcamp_demo/ff_alexnet_cifar10.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/ff_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-from flexflow.core import *
-from flexflow.keras.datasets import cifar10
-from flexflow.torch.model import PyTorchModel
-from PIL import Image
-
-def top_level_task():
- ffconfig = FFConfig()
- ffconfig.parse_args()
- print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.get_batch_size(), ffconfig.get_workers_per_node(), ffconfig.get_num_nodes()))
- ffmodel = FFModel(ffconfig)
-
- dims_input = [ffconfig.get_batch_size(), 3, 229, 229]
- input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT)
-
- torch_model = PyTorchModel("alexnet.ff")
- output_tensors = torch_model.apply(ffmodel, [input_tensor])
-
- ffoptimizer = SGDOptimizer(ffmodel, 0.01)
- ffmodel.set_sgd_optimizer(ffoptimizer)
- ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
- label_tensor = ffmodel.get_label_tensor()
-
- num_samples = 10000
-
- (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples)
-
- full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32)
-
- for i in range(0, num_samples):
- image = x_train[i, :, :, :]
- image = image.transpose(1, 2, 0)
- pil_image = Image.fromarray(image)
- pil_image = pil_image.resize((229,229), Image.NEAREST)
- image = np.array(pil_image, dtype=np.float32)
- image = image.transpose(2, 0, 1)
- full_input_np[i, :, :, :] = image
-
- full_input_np /= 255
-
- y_train = y_train.astype('int32')
- full_label_np = y_train
-
- dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_np)
- dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_np)
-
- num_samples = dataloader_input.num_samples
-
- ffmodel.init_layers()
-
- epochs = ffconfig.get_epochs()
-
- ts_start = ffconfig.get_current_time()
-
- ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs)
-
- ts_end = ffconfig.get_current_time()
- run_time = 1e-6 * (ts_end - ts_start);
- print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time));
-
- # perf_metrics = ffmodel.get_perf_metrics()
- # accuracy = perf_metrics.get_accuracy()
- # if accuracy < ModelAccuracy.CIFAR10_CNN.value:
- # assert 0, 'Check Accuracy'
-
-
-if __name__ == "__main__":
- print("cifar10 cnn")
- top_level_task()
diff --git a/bootcamp_demo/keras_cnn_cifar10.py b/bootcamp_demo/keras_cnn_cifar10.py
deleted file mode 100644
index a62f625449..0000000000
--- a/bootcamp_demo/keras_cnn_cifar10.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/keras_cnn_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-# from keras.models import Model, Sequential
-# from keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout
-# from keras.optimizers import SGD
-# from keras.datasets import cifar10
-# from keras import losses
-# from keras import metrics
-
-from flexflow.keras.models import Model, Sequential
-from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout
-from flexflow.keras.optimizers import SGD
-from flexflow.keras.datasets import cifar10
-from flexflow.keras import losses
-from flexflow.keras import metrics
-
-import numpy as np
-
-def top_level_task():
- num_classes = 10
-
- num_samples = 10000
-
- #(x_train, y_train), (x_test, y_test) = cifar10.load_data()
- (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples)
-
- x_train = x_train.astype('float32')
- x_train /= 255
- y_train = y_train.astype('int32')
- print("shape: ", x_train.shape[1:])
-
- model = Sequential()
-
- model.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"))
- model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid"))
- model.add(Activation("relu"))
- model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"))
- model.add(Flatten())
- model.add(Dense(512))
- model.add(Activation("relu"))
- model.add(Dropout(0.5))
- model.add(Dense(num_classes))
- model.add(Activation("softmax"))
-
- opt = SGD(learning_rate=0.01)
- model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
- print(model.summary())
-
- model.fit(x_train, y_train, batch_size=64, epochs=4)
-
-if __name__ == "__main__":
- print("Functional API, cifar10 cnn")
- top_level_task()
\ No newline at end of file
diff --git a/bootcamp_demo/torch_alexnet_cifar10.py b/bootcamp_demo/torch_alexnet_cifar10.py
deleted file mode 100644
index 394161c5a3..0000000000
--- a/bootcamp_demo/torch_alexnet_cifar10.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/torch_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-# https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py
-
-import torch.nn as nn
-import torch
-import flexflow.torch.fx as fx
-import torchvision.models as models
-
-class AlexNet(nn.Module):
- def __init__(self, num_classes: int = 1000) -> None:
- super(AlexNet, self).__init__()
- self.features = nn.Sequential(
- nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- nn.Conv2d(64, 192, kernel_size=5, padding=2),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- nn.Conv2d(192, 384, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.Conv2d(384, 256, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.Conv2d(256, 256, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- )
- self.classifier = nn.Sequential(
- nn.Linear(256 * 6 * 6, 4096),
- nn.ReLU(inplace=True),
- nn.Linear(4096, 4096),
- nn.ReLU(inplace=True),
- nn.Linear(4096, num_classes),
- nn.Softmax(),
- )
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- x = self.features(x)
- x = torch.flatten(x, 1)
- x = self.classifier(x)
- return x
-
-model = AlexNet(num_classes=10)
-fx.torch_to_flexflow(model, "alexnet.ff")
\ No newline at end of file
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index f4111d8ea6..45ecc1798b 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -13,8 +13,19 @@ if(CUDA_FOUND)
# set cuda runtime and driver lib
# override cublas and curand because the FindCUDA module may not find the correct libs
set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT})
- set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT})
- set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT})
+ if(CUBLAS_PATH)
+ set(CUBLAS_ROOT ${CUBLAS_PATH})
+ else()
+ set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+ endif()
+ set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT})
+ if(CURAND_PATH)
+ set(CURAND_ROOT ${CURAND_PATH})
+ else()
+ set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+ endif()
+ set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT})
+
list(APPEND FLEXFLOW_EXT_LIBRARIES
${CUDADRV_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
@@ -53,8 +64,12 @@ if(CUDA_FOUND)
message( STATUS "CUDA Detected CUDA_ARCH : ${DETECTED_CUDA_ARCH}" )
set(FF_CUDA_ARCH ${DETECTED_CUDA_ARCH})
# Set FF_CUDA_ARCH to the list of all GPU architectures compatible with FlexFlow
- elseif("${FF_CUDA_ARCH}" STREQUAL "all")
- set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86)
+ elseif("${FF_CUDA_ARCH}" STREQUAL "all")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+ set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86,90)
+ else()
+ set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86)
+ endif()
endif()
# create CUDA_GENCODE list based on FF_CUDA_ARCH
@@ -66,6 +81,7 @@ if(CUDA_FOUND)
endforeach()
string(REGEX REPLACE "([0-9]+)" "-gencode arch=compute_\\1,code=sm_\\1" CUDA_GENCODE "${CUDA_GENCODE}")
+ set(CMAKE_CUDA_COMPILER "${CUDA_NVCC_EXECUTABLE}")
#output
message( STATUS "CUDA_VERSION: ${CUDA_VERSION}")
message( STATUS "CUDA root path : ${CUDA_TOOLKIT_ROOT_DIR}" )
@@ -76,6 +92,7 @@ if(CUDA_FOUND)
message( STATUS "CURAND libraries : ${CUDA_curand_LIBRARY}" )
message( STATUS "CUDA Arch : ${FF_CUDA_ARCH}" )
message( STATUS "CUDA_GENCODE: ${CUDA_GENCODE}")
+ message( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}")
list(APPEND FLEXFLOW_INCLUDE_DIRS
${CUDA_INCLUDE_DIRS})
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000..25f2e05e19
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,12 @@
+if (NOT FF_HIP_ARCH STREQUAL "")
+ if (FF_HIP_ARCH STREQUAL "all")
+ set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103")
+ endif()
+ string(REPLACE "," "," HIP_ARCH_LIST "${FF_HIP_ARCH}")
+endif()
+
+message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}")
+if(FF_GPU_BACKEND STREQUAL "hip_rocm")
+ #set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE)
+ set(GPU_TARGETS "${FF_HIP_ARCH}" CACHE STRING "The GPU TARGETs")
+endif()
diff --git a/cmake/legion.cmake b/cmake/legion.cmake
index b4cfad20e2..2afb507d3b 100644
--- a/cmake/legion.cmake
+++ b/cmake/legion.cmake
@@ -132,6 +132,10 @@ else()
set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version")
set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit")
set(GASNet_CONDUIT ${FF_GASNET_CONDUIT})
+ elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx")
+ set(ucx_ROOT ${UCX_PATH}/lib/cmake)
+ message(STATUS "Find ucx: ${UCX_PATH}")
+ set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX")
endif()
message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}")
set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions")
@@ -142,8 +146,11 @@ else()
set(Legion_USE_HIP ON CACHE BOOL "enable Legion_USE_HIP" FORCE)
if (FF_GPU_BACKEND STREQUAL "hip_cuda")
set(Legion_HIP_TARGET "CUDA" CACHE STRING "Legion_HIP_TARGET CUDA" FORCE)
+ set(Legion_CUDA_ARCH ${FF_CUDA_ARCH} CACHE STRING "Legion CUDA ARCH" FORCE)
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
set(Legion_HIP_TARGET "ROCM" CACHE STRING "Legion HIP_TARGET ROCM" FORCE)
+ set(Legion_HIP_ARCH ${FF_HIP_ARCH} CACHE STRING "Legion HIP ARCH" FORCE)
+ message(STATUS "Legion_HIP_ARCH: ${Legion_HIP_ARCH}")
endif()
endif()
set(Legion_REDOP_COMPLEX OFF CACHE BOOL "disable complex")
diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake
index 04a23dcb8a..c140a44ec8 100644
--- a/cmake/nccl.cmake
+++ b/cmake/nccl.cmake
@@ -109,8 +109,9 @@ else()
message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
add_library(nccl SHARED IMPORTED)
+
+ # Build NCCL from source
else()
- # Build NCCL from source
message(STATUS "Building NCCL from source")
list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)
diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt
index b7795daf71..217d7e14f0 100644
--- a/cmake/pip_install/CMakeLists.txt
+++ b/cmake/pip_install/CMakeLists.txt
@@ -1,10 +1,26 @@
# Use setup.py script to re-install the Python bindings library with the right library paths
if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if(FF_BUILD_FROM_PYPI)
- install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
+ cmake_path(SET CMAKE_SOURCE_DIR_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion)
+ cmake_path(SET CMAKE_BUILD_DIR_ NORMALIZE ${Legion_BINARY_DIR}/runtime)
+ cmake_path(SET CMAKE_INSTALL_PREFIX_ NORMALIZE ${PY_DEST}/../../..)
+ cmake_path(SET WORKING_DIRECTORY_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/)
# CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
# Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion
- install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
+ # CMAKE_SOURCE_DIR_=/usr/FlexFlow/deps/legion
+ # CMAKE_BUILD_DIR_: /usr/FlexFlow/build//deps/legion/runtime
+ # CMAKE_INSTALL_PREFIX_: /opt/conda/ or /usr/local
+ # WORKING_DIRECTORY_: /usr/FlexFlow/deps/legion/bindings/python/
+ # PY_DEST: /python3.11/site-packages
+ message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
+ message(STATUS "Legion_BINARY_DIR: ${Legion_BINARY_DIR}")
+ message(STATUS "CMAKE_SOURCE_DIR_: ${CMAKE_SOURCE_DIR_}")
+ message(STATUS "CMAKE_BUILD_DIR_: ${CMAKE_BUILD_DIR_}")
+ message(STATUS "CMAKE_INSTALL_PREFIX_: ${CMAKE_INSTALL_PREFIX_}")
+ message(STATUS "WORKING_DIRECTORY_: ${WORKING_DIRECTORY_}")
+ message(STATUS "PY_DEST: ${PY_DEST}")
+ install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${CMAKE_INSTALL_PREFIX_} \")")
+ install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E env CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR_} CMAKE_BUILD_DIR=${CMAKE_BUILD_DIR_} CMAKE_INSTALL_PREFIX=${PY_DEST}/flexflow ${Python3_EXECUTABLE} setup.py install --prefix ${CMAKE_INSTALL_PREFIX_} ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${WORKING_DIRECTORY_} COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)")
endif()
endif()
diff --git a/cmake/zlib.cmake b/cmake/zlib.cmake
deleted file mode 100644
index 0281e02b88..0000000000
--- a/cmake/zlib.cmake
+++ /dev/null
@@ -1,8 +0,0 @@
-find_package(ZLIB REQUIRED)
-if(ZLIB_FOUND)
- list(APPEND FLEXFLOW_EXT_LIBRARIES
- ${ZLIB_LIBRARIES})
- message( STATUS "ZLIB libraries : ${ZLIB_LIBRARIES}" )
-else()
- message( FATAL_ERROR "ZLIB package not found")
-endif()
\ No newline at end of file
diff --git a/conda/build.sh b/conda/build.sh
deleted file mode 100755
index 0e84b7489a..0000000000
--- a/conda/build.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#! /usr/bin/env bash
-set -euo pipefail
-
-# Cd into FF_HOME
-cd "${BASH_SOURCE[0]%/*}/../"
-
-# build flexflow
-# "search and replace" bash syntax used below to make shellcheck happy.
-# see here: https://wiki-dev.bash-hackers.org/syntax/pe
-CXXFLAGS="${CXXFLAGS//-O2/}"
-CXXFLAGS="${CXXFLAGS//-std=c++17/}"
-CXXFLAGS="${CXXFLAGS//-DNDEBUG/}"
-CXXFLAGS="${CXXFLAGS//-D_FORTIFY_SOURCE=2/}"
-export CXXFLAGS
-CPPFLAGS="${CPPFLAGS//-O2/}"
-CPPFLAGS="${CPPFLAGS//-std=c++17/}"
-CPPFLAGS="${CPPFLAGS//-DNDEBUG/}"
-CPPFLAGS="${CPPFLAGS//-D_FORTIFY_SOURCE=2/}"
-export CPPFLAGS
-
-#export CUDNN_HOME=/projects/opt/centos7/cuda/10.1
-#export CUDA_HOME=/projects/opt/centos7/cuda/10.1
-export PROTOBUF_DIR=$BUILD_PREFIX
-export FF_HOME=$SRC_DIR
-export LG_RT_DIR=$SRC_DIR/legion/runtime
-#export FF_ENABLE_DEBUG=1
-#export DEBUG=0
-
-cd python
-make
diff --git a/conda/environment.yml b/conda/environment.yml
index 2069acccdf..48cd8ddb33 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -3,11 +3,13 @@ channels:
- defaults
- conda-forge
dependencies:
- - python>=3.6
+ - python>=3.6,<3.12
- cffi>=1.11.0
- Pillow
- pybind11
+ - rust
- cmake-build-extension
+ - jq
- pip
- pip:
- qualname>=0.1.0
diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml
deleted file mode 100644
index cc6fcf4667..0000000000
--- a/conda/flexflow-cpu.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: flexflow
-channels:
- - defaults
- - conda-forge
-dependencies:
- - python>=3.6
- - cffi>=1.11.0
- - Pillow
- - pybind11
- - cmake-build-extension
- - pytest
- - pip
- - pip:
- - qualname>=0.1.0
- - keras_preprocessing>=1.1.2
- - numpy>=1.16.0
- - torch --index-url https://download.pytorch.org/whl/cpu
- - torchaudio --index-url https://download.pytorch.org/whl/cpu
- - torchvision --index-url https://download.pytorch.org/whl/cpu
- - requests
diff --git a/conda/flexflow.yml b/conda/flexflow.yml
new file mode 100644
index 0000000000..091ba929e4
--- /dev/null
+++ b/conda/flexflow.yml
@@ -0,0 +1,34 @@
+name: flexflow
+channels:
+ - defaults
+ - conda-forge
+dependencies:
+ - python>=3.6,<3.12
+ - cffi>=1.11.0
+ - Pillow
+ - pybind11
+ - rust
+ - cmake-build-extension
+ - jq
+ - pytest
+ - pip
+ - pip:
+ - qualname>=0.1.0
+ - keras_preprocessing>=1.1.2
+ - numpy>=1.16.0
+ - torch>=1.13.1 --index-url https://download.pytorch.org/whl/cpu
+ - torchaudio>=0.13.1 --index-url https://download.pytorch.org/whl/cpu
+ - torchvision>=0.14.1 --index-url https://download.pytorch.org/whl/cpu
+ - regex
+ - onnx
+ - transformers>=4.31.0
+ - sentencepiece
+ - einops
+ - requests
+ - scipy
+ - bitsandbytes
+ - datasets
+ - accelerate
+ - loralib
+ - triton
+ - peft
diff --git a/conda/meta.yaml b/conda/meta.yaml
deleted file mode 100644
index b6e14b2957..0000000000
--- a/conda/meta.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-package:
- name: flexflow
- version: "1.0"
-
-source:
- git_rev: master
- git_url: https://github.com/flexflow/FlexFlow.git
-
-build:
- number: 0
-
-requirements:
- build:
- - make
- - git
- - zlib
- - protobuf
- - {{ compiler('c') }}
- - {{ compiler('cxx') }}
- host:
- - python
- - cffi
- run:
- - cffi
- - numpy
- - python
- - zlib
- - keras-preprocessing
diff --git a/conda/pytorch-gpu.yml b/conda/pytorch-gpu.yml
index 677e71d73f..85d24ced17 100644
--- a/conda/pytorch-gpu.yml
+++ b/conda/pytorch-gpu.yml
@@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- - python>=3.6
+ - python>=3.6,<3.12
- pip
- pip:
- numpy>=1.16.0
diff --git a/config/config.inc b/config/config.inc
index ebc6b9cb49..6431eaf136 100644
--- a/config/config.inc
+++ b/config/config.inc
@@ -24,7 +24,20 @@ fi
#set installation dir
if [ -n "$INSTALL_DIR" ]; then
- SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}"
+ SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}"
+fi
+
+if [ "$INFERENCE_TESTS" = "ON" ]; then
+ SET_INFERENCE_TESTS="-DINFERENCE_TESTS=ON"
+else
+ SET_INFERENCE_TESTS="-DINFERENCE_TESTS=OFF"
+fi
+
+#set cmake prefix path dir
+if [ -n "$LIBTORCH_PATH" ]; then
+ SET_LIBTORCH_PATH="-DLIBTORCH_PATH=${LIBTORCH_PATH}"
+else
+ SET_LIBTORCH_PATH=""
fi
# set build type
@@ -37,6 +50,11 @@ if [ -n "$FF_CUDA_ARCH" ]; then
SET_CUDA_ARCH="-DFF_CUDA_ARCH=${FF_CUDA_ARCH}"
fi
+# set HIP Arch
+if [ -n "$FF_HIP_ARCH" ]; then
+ SET_HIP_ARCH="-DFF_HIP_ARCH=${FF_HIP_ARCH}"
+fi
+
# set CUDA dir
if [ -n "$CUDA_DIR" ]; then
SET_CUDA="-DCUDA_PATH=${CUDA_DIR}"
@@ -44,11 +62,30 @@ if [ -n "$CUDA_DIR" ]; then
SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}"
fi
+# set cublas dir
+if [ -n "$CUBLAS_DIR" ]; then
+ SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}"
+fi
+
+# set curand dir
+if [ -n "$CURAND_DIR" ]; then
+ SET_CURAND="-DCURAND_PATH=${CURAND_DIR}"
+fi
+
# set cudnn dir
if [ -n "$CUDNN_DIR" ]; then
SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
fi
+# build legion only
+if [ "$BUILD_LEGION_ONLY" = "ON" ]; then
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=ON"
+elif [ "$BUILD_LEGION_ONLY" = "OFF" ]; then
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF"
+else
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF"
+fi
+
# enable Python
if [ "$FF_USE_PYTHON" = "ON" ]; then
SET_PYTHON="-DFF_USE_PYTHON=ON"
@@ -81,12 +118,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi"
elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp"
- elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
- SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
- SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
+ # set ucx dir
+ if [ -n "$UCX_DIR" ]; then
+ SET_UCX="-DUCX_PATH=${UCX_DIR}"
+ fi
fi
# build C++ examples
@@ -97,6 +135,13 @@ elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then
else
SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON"
fi
+if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON"
+elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF"
+else
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON"
+fi
# enable C++ unit tests
if [ "$FF_BUILD_UNIT_TESTS" = "ON" ]; then
@@ -145,11 +190,18 @@ if [ -n "$FF_MAX_DIM" ]; then
SET_MAX_DIM="-DFF_MAX_DIM=${FF_MAX_DIM}"
fi
+#set LEGION_MAX_RETURN_SIZE
+if [ -n "$LEGION_MAX_RETURN_SIZE" ]; then
+ SET_LEGION_MAX_RETURN_SIZE="-DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}"
+fi
+
# set ROCM path
if [ -n "$ROCM_PATH" ]; then
- SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}"
+ SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} -DHIP_ROOT_DIR=${ROCM_PATH}"
fi
+ADD_ROCM_TO_PATH=""
+
# set GPU backend
if [ -n "$FF_GPU_BACKEND" ]; then
SET_FF_GPU_BACKEND="-DFF_GPU_BACKEND=${FF_GPU_BACKEND}"
@@ -182,17 +234,18 @@ if [ -n "$FF_GPU_BACKEND" ]; then
chmod +x "$(pwd)/nvidia_hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc"
else
- SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
+ ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin"
+ #SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
fi
fi
fi
fi
-CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
+CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
function run_cmake() {
SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
-CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}"
+CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} ${ADD_ROCM_TO_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}"
echo $CMAKE_COMMAND
eval $CMAKE_COMMAND
}
diff --git a/config/config.linux b/config/config.linux
index 04908d81b2..acffc210f5 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -1,5 +1,4 @@
#!/bin/bash
-
# set the CC and CXX, usually it is not needed as cmake can detect it
# set CC and CXX to mpicc and mpic++ when enable gasnet
# CC=mpicc
@@ -11,24 +10,46 @@
#LD_FLAGS=${LD_FLAGS+=""}
#set install dir
-#INSTALL_DIR=
+INSTALL_DIR=${INSTALL_DIR:-}
# set build type
BUILD_TYPE=${BUILD_TYPE:-Release}
+INFERENCE_TESTS=${INFERENCE_TESTS:-OFF}
+LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"}
+if [[ "$INFERENCE_TESTS" == "ON" && ! -d "$LIBTORCH_PATH" ]]; then
+ cwd="$(pwd)"
+ cd ../..
+ wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip
+ unzip libtorch-shared-with-deps-latest.zip
+ rm libtorch-shared-with-deps-latest.zip
+ LIBTORCH_PATH="$(pwd)/libtorch"
+ cd "$cwd"
+fi
+
# set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal).
# To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75).
# Alternatively, set "FF_CUDA_ARCH=autodetect" to build FlexFlow for all architectures detected on the machine,
# or set "FF_CUDA_ARCH=all" to build FlexFlow for all supported GPU architectures
FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"}
-
-# set CUDNN dir in case cmake cannot autodetect a path
-CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+# FF_HIP_ARCH only supports building for a specific AMD architecture, a list of architectures separated by a comma
+# or all available architectures. TODO: support autodetect
+FF_HIP_ARCH=${FF_HIP_ARCH:-"all"}
# set CUDA dir in case cmake cannot autodetect a path
CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}
-#set NCCL dir
+# set CUBLAS dir in case it is not stored in the CUDA DIR
+CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"}
+
+# set CURAND dir in case it is not stored in the CUDA DIR
+CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"}
+
+# set CUDNN dir in case cmake cannot autodetect a path
+CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+
+# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib,
+# otherwise, we will build nccl from source
NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}
# enable Python
@@ -40,11 +61,12 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}
# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}
-# set UCX URL
-FF_UCX_URL=${FF_UCX_URL:-""}
+# set UCX dir if Legion networks is set to ucx
+UCX_DIR=${UCX_DIR:-""}
# build C++ examples
FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF}
+FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON}
# build C++ unit tests
FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
@@ -52,6 +74,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
# use precompiled NCCL and Legion libraries, where available
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF}
FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF}
+
# use the flag below to use both the NCCL and Legion pre-built libraries.
# when the flag below is set to ON, the two flags above are ignored.
FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF}
@@ -62,6 +85,12 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF}
# set MAX_DIM
FF_MAX_DIM=${FF_MAX_DIM:-5}
+# set BUILD_LEGION_ONLY
+BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF}
+
+# set LEGION_MAX_RETURN_SIZE
+LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144}
+
# set ROCM path
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"}
@@ -70,7 +99,7 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
-elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then
+elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
else
@@ -79,7 +108,7 @@ fi
function get_build_configs() {
# Create a string with the values of the variables set in this script
- BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}"
+ BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
}
if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
diff --git a/deps/legion b/deps/legion
index 626b55689c..0d32b35542 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c
+Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b
diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp
new file mode 160000
index 0000000000..c0fab1e14a
--- /dev/null
+++ b/deps/tokenizers-cpp
@@ -0,0 +1 @@
+Subproject commit c0fab1e14a9421c1501acee5b7703e5dafa60479
diff --git a/docker/README.md b/docker/README.md
index 916b78acf6..010aadf762 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -2,50 +2,61 @@
This folder contains the Dockerfiles and scripts that you can use to quickly run FlexFlow with no manual installation required. To use the containers, follow the steps below.
## Prerequisites
-You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine.
+You can build and run the FlexFlow Docker images on any machine, but if you want to train or serve a model, you will need a machine with an NVIDIA or AMD GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine. If using an AMD GPU, follow the [Deploy ROCm Docker containers](https://rocm.docs.amd.com/en/latest/deploy/docker.html) instructions.
## Downloading a pre-built package
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):
-* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-`, e.g. [flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8)
-* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8).
+* `flexflow`: the pre-built version of FlexFlow. We currently publish four versions targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5, and 5.6), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2). The images are named `flexflow-<gpu backend>-<gpu software version>`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0).
+* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run FlexFlow. You may find them useful if you want to build FlexFlow yourself. We also publish four versions of `flexflow-environment` for AMD GPUs and, for NVIDIA GPUs, one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 12.0 is tagged [flexflow-environment-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-12.0).
The easiest way to download any of the Docker containers above is to call:
```
-FF_GPU_BACKEND= cuda_version= ./docker/pull.sh
+./docker/pull.sh
```
-where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND`/`cuda_version` are optional environment variables you can use if you wish to download the docker image for a GPU backend and/or cuda version other than those installed on your machine (leaving these variables unset will let the script autodetect which version to download depending on your setup).
+where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`). By default, the script will assume an NVIDIA backend and attempt to detect the CUDA version on your machine, to download the relevant container. If your machine has AMD GPUs, or no GPUs, or if you want to specify the CUDA/ROCM version to download, set the environment variables below (an example invocation follows the list):
+
+* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be downloaded.
+* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1 and 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored.
+* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.
+
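+For example, to explicitly download the `flexflow` image for an AMD GPU with ROCm 5.6 (hypothetical values; adjust them to your setup), you could run:
+
+```
+FF_GPU_BACKEND=hip_rocm hip_version=5.6 ./docker/pull.sh flexflow
+```
+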
After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below.
## Building a Docker container from scratch
-If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=11.8`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA.
+If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](https://flexflow.readthedocs.io/en/latest/installation.html) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=12.0`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA. Similarly, you can pick the ROCm version by setting `hip_version` when the backend is `FF_GPU_BACKEND=hip_rocm`, whereas the env will be ignored for non-HIP backends.
To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted):
```
-FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow
+./docker/build.sh flexflow
```
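+For example, to build the `flexflow` image for CUDA 12.0 instead of the autodetected version (hypothetical values; adjust them to your setup), you could run:
+
+```
+FF_GPU_BACKEND=cuda cuda_version=12.0 ./docker/build.sh flexflow
+```
+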
If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run:
```
-FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow-environment
+./docker/build.sh flexflow-environment
```
## Running a Docker container
-After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND` and `cuda_version` optional environment variables to run the docker image with the desired GPU backend and CUDA version. Leaving these variables unset will instruct the script to autodetect the GPU backend and CUDA version installed on the current machine and run the Docker container with it if available.
+After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND`, `cuda_version` and `hip_version` optional environment variables to run the docker image with the desired GPU backend and CUDA/HIP version:
+
+* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be run.
+* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored.
+* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.
+
+Leaving these variables unset will assume an NVIDIA (CUDA) backend, and instruct the script to autodetect the CUDA version installed on the current machine and run the Docker container with it if available.
```
-FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow
+./docker/run.sh --image_name flexflow
```
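+For example, to run the CUDA 11.8 version of the `flexflow` image (hypothetical values; the corresponding image must have been pulled or built first), you could run:
+
+```
+FF_GPU_BACKEND=cuda cuda_version=11.8 ./docker/run.sh --image_name flexflow
+```
+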
If you wish to run the `flexflow-environment` container, run:
```
-FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow-environment
+./docker/run.sh --image_name flexflow-environment
```
N.B.: If you don't have GPUs available on the machine, or you wish to run the docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script.
diff --git a/docker/build.sh b/docker/build.sh
index 6ed5cbe00e..b68860712f 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./build.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into $FF_HOME. Assumes this script is in $FF_HOME/docker
cd "${BASH_SOURCE[0]%/*}/.."
@@ -11,6 +11,8 @@ cd "${BASH_SOURCE[0]%/*}/.."
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
+python_version=${python_version:-latest}
# Check docker image name
if [[ "$image" != @(flexflow-environment|flexflow) ]]; then
@@ -28,52 +30,97 @@ else
echo "Building $image docker image with default GPU backend: cuda"
fi
+# base image to use when building the flexflow environment docker image.
+ff_environment_base_image="ubuntu:20.04"
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported, and modify cuda version to include default subsubversion
- if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then
+ if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then
cuda_version_input=${cuda_version}.1
- elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then
+ elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then
cuda_version_input=${cuda_version}.2
+ elif [[ "$cuda_version" == @(11.4) ]]; then
+ cuda_version_input=${cuda_version}.3
elif [[ "$cuda_version" == @(11.8) ]]; then
cuda_version_input=${cuda_version}.0
+ elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+    # Use CUDA 12.2 for all versions greater than 12.2 for now (an NVIDIA base image with cuDNN is not yet available for newer versions)
+ cuda_version=12.2
+ cuda_version_input=${cuda_version}.2
else
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
- # Set cuda version suffix to docker image name
echo "Building $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
- # Pick a default CUDA version for the base docker image from NVIDIA
- cuda_version_input="11.8.0"
+ ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04"
+ gpu_backend_version="-${cuda_version}"
fi
-docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "cuda_version=${cuda_version_input}" -t "flexflow-environment-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow-environment/Dockerfile .
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Building $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
+fi
+
+# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking
+cores_available=$(nproc --all)
+n_build_cores=$(( cores_available -1 ))
+
+# check python_version
+if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then
+  echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11, latest}"
+  exit 1
+fi
+
+docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" --build-arg "python_version=${python_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile .
# If the user only wants to build the environment image, we are done
if [[ "$image" == "flexflow-environment" ]]; then
exit 0
fi
-# Gather arguments needed to build the FlexFlow image
-# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking
-cores_available=$(nproc --all)
-n_build_cores=$(( cores_available -1 ))
+# Done with flexflow-environment image
-# If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker
-# image will not have access to GPUs during the build phase (due to a Docker restriction). In all other
-# cases, we pass the value of FF_CUDA_ARCH directly to Cmake.
-if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then
- # Get CUDA architecture(s), if GPUs are available
- cat << EOF > ./get_gpu_arch.cu
+###########################################################################################
+
+# Build flexflow image if requested
+if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker
+ # image will not have access to GPUs during the build phase (due to a Docker restriction). In all other
+ # cases, we pass the value of FF_CUDA_ARCH directly to Cmake.
+ if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then
+ # Get CUDA architecture(s), if GPUs are available
+ cat << EOF > ./get_gpu_arch.cu
#include
int main() {
int count = 0;
@@ -87,24 +134,25 @@ int main() {
return 0;
}
EOF
- gpu_arch_codes=""
- if command -v nvcc &> /dev/null
- then
- nvcc ./get_gpu_arch.cu -o ./get_gpu_arch
- gpu_arch_codes="$(./get_gpu_arch)"
- fi
- gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)"
- gpu_arch_codes="${gpu_arch_codes// /,}"
- rm -f ./get_gpu_arch.cu ./get_gpu_arch
-
- if [[ -n "$gpu_arch_codes" ]]; then
- echo "Host machine has GPUs with architecture codes: $gpu_arch_codes"
- echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)."
- FF_CUDA_ARCH="${gpu_arch_codes}"
- export FF_CUDA_ARCH
- else
- echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs."
- exit 1
+ gpu_arch_codes=""
+ if command -v nvcc &> /dev/null
+ then
+ nvcc ./get_gpu_arch.cu -o ./get_gpu_arch
+ gpu_arch_codes="$(./get_gpu_arch)"
+ fi
+ gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)"
+ gpu_arch_codes="${gpu_arch_codes// /,}"
+ rm -f ./get_gpu_arch.cu ./get_gpu_arch
+
+ if [[ -n "$gpu_arch_codes" ]]; then
+ echo "Host machine has GPUs with architecture codes: $gpu_arch_codes"
+ echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)."
+ FF_CUDA_ARCH="${gpu_arch_codes}"
+ export FF_CUDA_ARCH
+ else
+ echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs."
+ exit 1
+ fi
fi
fi
@@ -114,4 +162,4 @@ fi
# Set value of BUILD_CONFIGS
get_build_configs
-docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow/Dockerfile .
+docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "gpu_backend_version=${gpu_backend_version}" -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow/Dockerfile .
diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile
index 50497197c9..3434916d6b 100644
--- a/docker/flexflow-environment/Dockerfile
+++ b/docker/flexflow-environment/Dockerfile
@@ -1,11 +1,11 @@
-ARG cuda_version
-FROM nvidia/cuda:${cuda_version}-cudnn8-devel-ubuntu20.04
+ARG ff_environment_base_image
+FROM ${ff_environment_base_image}
LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow
LABEL org.opencontainers.image.description="FlexFlow environment container"
# Install basic dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano libhdf5-dev && \
+RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \
@@ -16,14 +16,31 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut
apt-get upgrade -y libstdc++6
# Install Python3 with Miniconda
-RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- mv Miniconda3-latest-Linux-x86_64.sh ~/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \
- bash ~/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
- rm ~/Miniconda3-latest-Linux-x86_64.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
+ARG python_version "latest"
+#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \
+RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
+ if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \
+ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \
+ exit 1; \
+ fi; \
+ if [ "${python_version}" = "3.8" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py38_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.9" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py39_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.10" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py310_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.11" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
+ fi; \
+ wget -c -q https://repo.continuum.io/miniconda/${MINICONDA_SCRIPT_NAME} && \
+ mv ./${MINICONDA_SCRIPT_NAME} ~/${MINICONDA_SCRIPT_NAME} && \
+ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \
+ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \
+ rm ~/${MINICONDA_SCRIPT_NAME} && \
+ /opt/conda/bin/conda config --set solver classic && \
+ /opt/conda/bin/conda upgrade --all && \
+ /opt/conda/bin/conda install conda-build conda-verify && \
+ /opt/conda/bin/conda clean -ya
# Optionally install HIP dependencies
# Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
@@ -31,13 +48,36 @@ RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_6
# in the container. It also attempts to install packages for a graphical install.
# For our container, we don't need `hip-runtime-nvidia`
ARG FF_GPU_BACKEND "cuda"
+ARG hip_version "5.6"
+ARG N_BUILD_CORES
+# set MAKEFLAGS to speedup any dependency that uses make
+ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}"
+
RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \
- wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/bionic/amdgpu-install_22.20.50205-1_all.deb; \
- apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb; \
- rm ./amdgpu-install_22.20.50205-1_all.deb; \
+ # Check that hip_version is one of 5.3,5.4,5.5,5.6
+ if [ "$hip_version" != "5.3" ] && [ "$hip_version" != "5.4" ] && [ "$hip_version" != "5.5" ] && [ "$hip_version" != "5.6" ]; then \
+ echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"; \
+ exit 1; \
+ fi; \
+ # Compute script name and url given the version
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb; \
+ if [ "$hip_version" = "5.3" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb; \
+ elif [ "$hip_version" = "5.4" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb; \
+ elif [ "$hip_version" = "5.5" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb; \
+ fi; \
+ AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"; \
+ # Download and install AMD GPU software with ROCM and HIP support
+ wget $AMD_GPU_SCRIPT_URL; \
+ apt-get install -y ./${AMD_GPU_SCRIPT_NAME}; \
+ rm ./${AMD_GPU_SCRIPT_NAME}; \
amdgpu-install -y --usecase=hip,rocm --no-dkms; \
- apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk; \
+ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \
+ # Install protobuf dependencies
+ apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \
else \
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \
fi
@@ -51,8 +91,14 @@ ENV CUDA_DIR /usr/local/cuda
# Install python packages and other dependencies
RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing
# Install CPU-only Pytorch and related dependencies
-RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch
-RUN conda install -c conda-forge onnx transformers sentencepiece
-RUN pip3 install tensorflow
+RUN conda install pytorch torchvision torchaudio -c pytorch
+RUN conda install -c conda-forge onnx "transformers>=4.31.0" sentencepiece einops
+RUN pip3 install tensorflow notebook
+# PEFT-related
+RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft
+
+# Install Rust
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+ENV PATH /root/.cargo/bin:$PATH
ENTRYPOINT ["/bin/bash"]
diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile
index 0cda5cbc18..60f9d4d653 100644
--- a/docker/flexflow/Dockerfile
+++ b/docker/flexflow/Dockerfile
@@ -1,6 +1,6 @@
ARG FF_GPU_BACKEND "cuda"
-ARG cuda_version ""
-FROM flexflow-environment-$FF_GPU_BACKEND$cuda_version:latest
+ARG gpu_backend_version ""
+FROM flexflow-environment-$FF_GPU_BACKEND$gpu_backend_version:latest
LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow
LABEL org.opencontainers.image.description="FlexFlow container"
@@ -15,6 +15,15 @@ COPY . .
ARG BUILD_CONFIGS
ARG N_BUILD_CORES
+# Create install directory if needed
+RUN for pair in $BUILD_CONFIGS; do \
+ key=${pair%%=*}; \
+ value=${pair#*=}; \
+ if [ "$key" = "INSTALL_DIR" ] && [ -n "$value" ]; then \
+ mkdir -p "$value"; \
+ fi; \
+ done
+
# Build and install C++ and Python versions of FlexFlow
RUN mkdir -p build && cd build && \
eval "$BUILD_CONFIGS" ../config/config.linux && \
diff --git a/docker/publish.sh b/docker/publish.sh
index b8668d3c0e..c70419a9cc 100755
--- a/docker/publish.sh
+++ b/docker/publish.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./publish.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Check docker image name
if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
@@ -18,6 +19,9 @@ if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
exit 1
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
# Check GPU backend
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'."
@@ -31,25 +35,50 @@ fi
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Set cuda version suffix to docker image name
echo "Publishing $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+  echo "Publishing $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Check that image exists
-docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null
+docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null
# Log into container registry
FLEXFLOW_CONTAINER_TOKEN=${FLEXFLOW_CONTAINER_TOKEN:-}
@@ -59,8 +88,8 @@ echo "$FLEXFLOW_CONTAINER_TOKEN" | docker login ghcr.io -u flexflow --password-s
# Tag image to be uploaded
git_sha=${GITHUB_SHA:-$(git rev-parse HEAD)}
if [ -z "$git_sha" ]; then echo "Commit hash cannot be detected, cannot publish the docker image to ghcr.io"; exit; fi
-docker tag "${image}-${FF_GPU_BACKEND}${cuda_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest
+docker tag "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest
# Upload image
-docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest
+docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest
diff --git a/docker/pull.sh b/docker/pull.sh
index f8624a1072..f641e1a591 100755
--- a/docker/pull.sh
+++ b/docker/pull.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./pull.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Check docker image name
if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
@@ -28,31 +29,63 @@ else
echo "Downloading $image docker image with default GPU backend: cuda"
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
+ echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
+  # Use CUDA 12.2 for all versions greater than 12.2 for now
+ if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ cuda_version=12.2
+ fi
# Set cuda version suffix to docker image name
echo "Downloading $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Downloading $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Download image
-docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}"
+docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}"
# Tag downloaded image
-docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}":latest "$image-${FF_GPU_BACKEND}${cuda_version}":latest
+docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest "$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest
# Check that image exists
-docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null
+docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null
diff --git a/docker/run.sh b/docker/run.sh
index 307628f4fd..cf105a10c8 100755
--- a/docker/run.sh
+++ b/docker/run.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./run.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version, ATTACH_GPUS, SHM_SIZE
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version, ATTACH_GPUS, SHM_SIZE
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,12 +11,16 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Parameter controlling whether to attach GPUs to the Docker container
ATTACH_GPUS=${ATTACH_GPUS:-true}
gpu_arg=""
if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi
+# Whether to attach inference weights / files (make sure to download the weights first)
+ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false}
+
# Amount of shared memory to give the Docker container access to
# If you get a Bus Error, increase this value. If you don't have enough memory
# on your machine, decrease this value.
@@ -38,35 +42,82 @@ else
echo "Running $image docker image with default GPU backend: cuda"
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
+  # Use CUDA 12.2 for all versions greater than 12.2 for now
+ if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ cuda_version=12.2
+ fi
# Set cuda version suffix to docker image name
echo "Running $image docker image with CUDA $cuda_version"
- cuda_version_hyphen="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version_hyphen=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Running $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Check that image exists, if fails, print the default error message.
-if [[ "$(docker images -q "$image"-"$FF_GPU_BACKEND""$cuda_version_hyphen":latest 2> /dev/null)" == "" ]]; then
- echo ""
- echo "To download the docker image, run:"
- echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image"
- echo "To build the docker image from source, run:"
- echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image"
- echo ""
+if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest 2> /dev/null)" == "" ]]; then
+ echo "Error, ${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest does not exist!"
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ echo ""
+ echo "To download the docker image, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image"
+ echo "To build the docker image from source, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image"
+ echo ""
+ elif [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ echo ""
+ echo "To download the docker image, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/pull.sh $image"
+ echo "To build the docker image from source, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/build.sh $image"
+ echo ""
+ fi
exit 1
fi
-eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest"
+inference_volumes=""
+if $ATTACH_INFERENCE_FILES ; then
+ inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference";
+fi
+
+eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
diff --git a/docs/Makefile b/docs/Makefile
index 5424c5bc9f..d14c2ef91f 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -15,7 +15,7 @@ help:
.PHONY: help Makefile clean
clean:
- rm -rf build source/_doxygen/ source/c++_api/ doxygen/output
+ rm -rf build doxygen/output doxygen/cpp_api
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Catch-all target: route all unknown targets to Sphinx using the new
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index b38bfc12b5..aafa65d79b 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
-PROJECT_BRIEF = A distributed deep learning framework that supports flexible parallelization strategies.
+PROJECT_BRIEF = "A distributed deep learning framework that supports flexible parallelization strategies."
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -150,7 +150,7 @@ INLINE_INHERITED_MEMB = NO
# shortest path that makes the file name unique will be used
# The default value is: YES.
-FULL_PATH_NAMES = YES
+FULL_PATH_NAMES = NO
# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand
@@ -874,12 +874,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
-INPUT = $(FF_HOME)/align
-INPUT += $(FF_HOME)/bootcamp_demo
-INPUT += $(FF_HOME)/examples
INPUT += $(FF_HOME)/include
-INPUT += $(FF_HOME)/nmt
-INPUT += $(FF_HOME)/python
INPUT += $(FF_HOME)/src
# This tag can be used to specify the character encoding of the source files
@@ -911,12 +906,10 @@ INPUT_ENCODING = UTF-8
FILE_PATTERNS = *.c \
*.cc \
- *.cpp \
*.cu \
+ *.cpp \
*.h \
- *.hpp \
- *.md \
- *.py
+ *.hpp
# The RECURSIVE tag can be used to specify whether or not subdirectories should
# be searched for input files as well.
@@ -2110,7 +2103,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
-GENERATE_XML = YES
+GENERATE_XML = NO
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst
new file mode 100644
index 0000000000..c41307e231
--- /dev/null
+++ b/docs/source/chatbot.rst
@@ -0,0 +1,64 @@
+:tocdepth: 1
+********
+Chatbot
+********
+
+The chatbot use case involves setting up a conversational AI model using FlexFlow Serve, capable of engaging in interactive dialogues with users.
+
+Requirements
+============
+
+- FlexFlow Serve setup with required configurations.
+- Gradio or any interactive interface tool.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+   Initialize FlexFlow Serve with the desired configurations and a specific LLM model.
+
+2. Gradio Interface Setup
+   Define a function for response generation based on user inputs, and set up a Gradio Chat Interface for interaction.
+
+ .. code-block:: python
+
+ def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+
+
+3. Running the Interface
+ Launch the Gradio interface and interact with the model by entering text inputs.
+
+ .. image:: /imgs/gradio_interface.png
+ :alt: Gradio Chatbot Interface
+ :align: center
+
+4. Shutdown
+ Stop the FlexFlow server after interaction.
+
+Example
+=======
+
+Complete code examples can be found here:
+
+1. `Chatbot Example with incremental decoding `__
+
+2. `Chatbot Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ import gradio as gr
+ import flexflow.serve as ff
+
+ ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+
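+        # assumes llm was created and compiled beforehand (see the complete examples linked above)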
+ def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+
+ iface = gr.ChatInterface(fn=generate_response)
+ iface.launch()
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0e614f37c2..f67c0dae01 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,28 +13,42 @@
import os
import sys
import subprocess
+import shutil
+import sphinx # only needed for the manual post processing
+from pathlib import Path
+from m2r2 import convert
+from docutils.core import publish_string
+import re
def get_parent_dir_path(path):
return os.path.abspath(os.path.join(path, ".."))
docs_path = get_parent_dir_path(os.path.dirname(os.path.abspath(__file__)))
doxygen_path = os.path.join(docs_path, "doxygen")
+doxygen_output = os.path.join(doxygen_path, "output")
+doxygen_cpp_api_out = os.path.join(doxygen_path, "cpp_api")
FF_HOME = get_parent_dir_path(docs_path)
python_package_path = os.path.join(FF_HOME, "python")
sys.path.insert(0, os.path.abspath(python_package_path))
# Build the Doxygen docs
-#subprocess.call(f'cd {doxygen_path}; FF_HOME={FF_HOME} doxygen', shell=True)
+shutil.rmtree(doxygen_cpp_api_out, ignore_errors=True)
+for gpu_backend in ("cuda", "hip"):
+ doxygen_dest = os.path.join(doxygen_cpp_api_out, f"{gpu_backend}_api")
+ os.makedirs(doxygen_dest, exist_ok=True)
+ exclude_extension = ".cu" if gpu_backend == "hip" else ".cpp"
+ doxygen_cmd = f'export FF_HOME={FF_HOME}; ( cat Doxyfile ; echo "EXCLUDE_PATTERNS+=*{exclude_extension}" ) | doxygen -'
+ subprocess.check_call(doxygen_cmd, cwd=doxygen_path, shell=True)
+ subprocess.check_call(f'mv {os.path.join(doxygen_output, "html")}/* {doxygen_dest}/', shell=True)
import sphinx_rtd_theme
# -- Project information -----------------------------------------------------
project = 'FlexFlow'
-copyright = '2020, Stanford, LANL, CMU, Facebook'
-author = 'Stanford, LANL, CMU, Facebook'
-
+copyright = '2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)'
+author = 'CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)'
# -- General configuration ---------------------------------------------------
@@ -45,8 +59,6 @@ def get_parent_dir_path(path):
'sphinx_rtd_theme',
'sphinx.ext.autodoc',
'm2r2',
- 'breathe',
- 'exhale',
]
# Theme options are theme-specific and customize the look and feel of a theme
@@ -55,6 +67,7 @@ def get_parent_dir_path(path):
html_theme_options = {
"collapse_navigation" : False
}
+html_extra_path = [doxygen_cpp_api_out]
# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']
@@ -86,27 +99,50 @@ def get_parent_dir_path(path):
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
-# Breathe + Exhale configuration
-# Setup the breathe extension
-breathe_projects = {
- "FlexFlow": "./_doxygen/xml"
-}
-breathe_default_project = "FlexFlow"
-
-c_plus_plus_src_dirs = " ".join([f"\"{os.path.join(FF_HOME, 'src', dirname)}\"" for dirname in ("loss_functions", "mapper", "metrics_functions", "ops", "parallel_ops", "recompile", "runtime", "utils")])
-# Setup the exhale extension
-exhale_args = {
- # These arguments are required
- "containmentFolder": "./c++_api",
- "rootFileName": "c++_api_root.rst",
- "doxygenStripFromPath": "..",
- # Heavily encouraged optional argument (see docs)
- #"rootFileTitle": "Library API",
- # Suggested optional arguments
- "createTreeView": True,
- # TIP: if using the sphinx-bootstrap-theme, you need
- # "treeViewIsBootstrap": True,
- "exhaleExecutesDoxygen": True,
- "exhaleDoxygenStdin": f'INPUT = {c_plus_plus_src_dirs}'
-}
+def manual_post_processing(app, exception):
+ if exception is None and app.builder.name == 'html': # build succeeded
+ print(f'Post-processing HTML docs at path {app.outdir}')
+ build_dir = Path(app.outdir)
+
+ # List of subfolders to search
+ folder_paths = [build_dir, build_dir / 'developers_guide']
+
+ for folder_path in folder_paths:
+
+ # Only get HTML files in build dir, not subfolders
+ html_files = folder_path.glob('*.html')
+
+ for html_file in html_files:
+ content = html_file.read_text()
+
+ # Find dropdown menus, and manually convert their contents
+                pattern = r'<details>\n<summary>Expand here</summary>\n<br>\n\n(.*?)</details>'
+                blocks = re.findall(pattern, content, re.DOTALL)
+
+                for block in blocks:
+                    # Convert Markdown to HTML
+                    rst = convert(block, github_markdown=True)
+                    html = publish_string(rst, writer_name='html')
+                    html_str = html.decode('utf-8')
+
+                    # Replace block with converted HTML
+                    content = content.replace(block, html_str)
+
+                # Add space after dropdown menu block
+                content = content.replace('</details>', '</details>\n<br>')
+
+                # Replace incorrect links
+                content = content.replace('href="../docker/README.md"', 'href="docker.html"')
+                content = content.replace('href="./TRAIN.md"', 'href="train_overview.html"')
+                content = content.replace('href="./SERVE.md"', 'href="serve_overview.html"')
+                content = content.replace('href="./docs/source/keras.rst"', 'href="keras.html"')
+                content = content.replace('href="./docs/source/onnx.rst"', 'href="onnx.html"')
+
+                html_file.write_text(content)
+
+
+def setup(app):
+ app.connect('build-finished', manual_post_processing)
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
new file mode 100644
index 0000000000..b5d39be62e
--- /dev/null
+++ b/docs/source/cpp_api.rst
@@ -0,0 +1,10 @@
+*************
+C++ API
+*************
+
+The FlexFlow backend is at the core of FlexFlow Train and FlexFlow Serve. It is written entirely in C/C++ and CUDA/HIP. This section documents the API, which is generated by Doxygen and is available at the following links:
+
+* `CUDA version <./cuda_api/index.html>`_ (default version)
+* `HIP version <./hip_api/index.html>`_
+
+The two versions differ only in the GPU kernels, so the vast majority of the entries are identical. If you are unsure which version to use, take a look at the CUDA version.
diff --git a/docs/source/developers_guide.rst b/docs/source/developers_guide/developers_guide.rst
similarity index 64%
rename from docs/source/developers_guide.rst
rename to docs/source/developers_guide/developers_guide.rst
index 107135fae4..a125e60460 100644
--- a/docs/source/developers_guide.rst
+++ b/docs/source/developers_guide/developers_guide.rst
@@ -2,5 +2,5 @@
Developers Guide
******************
-.. mdinclude:: ../../CONTRIBUTING.md
+.. mdinclude:: ../../../CONTRIBUTING.md
:start-line: 2
diff --git a/docs/source/developers_guide/ff_internals.rst b/docs/source/developers_guide/ff_internals.rst
new file mode 100644
index 0000000000..15c0804255
--- /dev/null
+++ b/docs/source/developers_guide/ff_internals.rst
@@ -0,0 +1,6 @@
+*******************
+FlexFlow Internals
+*******************
+
+.. mdinclude:: internals.md
+ :start-line: 2
diff --git a/docs/source/developers_guide/internals.md b/docs/source/developers_guide/internals.md
new file mode 100644
index 0000000000..243b14a174
--- /dev/null
+++ b/docs/source/developers_guide/internals.md
@@ -0,0 +1,15 @@
+# FlexFlow Internals
+
+## The Parallel Computation Graph (PCG)
+
+FlexFlow uses a _Parallel Computation Graph (PCG)_ to simultaneously represent tensor operations, as well as parallelism choices and data movement across nodes.
+
+### Tensor representations
+
+There are two types of tensor representations in FlexFlow: a [Tensor](./cuda_api/de/da9/structFlexFlow_1_1TensorBase.html) and a [ParallelTensor](./cuda_api/d3/dfc/structFlexFlow_1_1ParallelTensorBase.html). The first variant is used when writing a FlexFlow DNN program, whereas the second is used by the runtime to run all the computations in a distributed fashion. `Tensor` and `ParallelTensor` are implemented as typedef-ed pointers to, respectively, the `TensorBase` (defined in `include/flexflow/tensor.h`) and `ParallelTensorBase` (defined in `include/flexflow/parallel_tensor.h`) structs.
+
+The `ParallelTensor` struct contains all the information that a `Tensor` stores, and in addition codifies how the tensor should be parallelized. For instance, a `ParallelTensor` records how each dimension is *partitioned*, how many *replicas* of the tensor have been created, and the *mapping* between the partitions of the tensor and the physical machines that will store them.
+
+## Transformation generation
+
+## Joint optimization
diff --git a/docs/source/docker.rst b/docs/source/docker.rst
index 4a457a8dcc..63f84e460c 100644
--- a/docs/source/docker.rst
+++ b/docs/source/docker.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
Docker
*************
diff --git a/docs/source/imgs/gradio_api.png b/docs/source/imgs/gradio_api.png
new file mode 100644
index 0000000000..7bf1b99a5e
Binary files /dev/null and b/docs/source/imgs/gradio_api.png differ
diff --git a/docs/source/imgs/gradio_interface.png b/docs/source/imgs/gradio_interface.png
new file mode 100644
index 0000000000..9584d76fb3
Binary files /dev/null and b/docs/source/imgs/gradio_interface.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7af62e417e..6aa47d157b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -7,47 +7,40 @@ Welcome to FlexFlow's documentation!
====================================
.. toctree::
- :maxdepth: 2
:caption: Getting Started
welcome
installation
docker
- jupyter
+ multinode
.. toctree::
- :maxdepth: 2
- :caption: Interoperability
+ :caption: FlexFlow Serve
- keras
- pytorch
- onnx
+ serve_overview
+ serve_usecases
+ serve_api
.. toctree::
- :maxdepth: 2
- :caption: Examples
-
- mt5
+ :caption: FlexFlow Train
-.. toctree::
- :maxdepth: 3
- :caption: Python API
+ train_overview
+ train_interface
+ train_examples
- python/models
- python/layers
- python/dataloader
+ train_python_api
.. toctree::
- :maxdepth: 2
- :caption: C++ API
+ :caption: FlexFlow Backend
- c++_api/c++_api_root
+ cpp_api
.. toctree::
- :maxdepth: 2
+ :maxdepth: 3
:caption: Developers Guide
- developers_guide
+ developers_guide/developers_guide.rst
+.. developers_guide/ff_internals.rst
.. Indices and tables
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 109b546834..95ec8596e6 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,5 +1,6 @@
+:tocdepth: 1
*************
-Installing FlexFlow
+Building from source
*************
.. mdinclude:: ../../INSTALL.md
diff --git a/docs/source/jupyter.rst b/docs/source/jupyter.rst
deleted file mode 100644
index 2e37bfb183..0000000000
--- a/docs/source/jupyter.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-*****************
-Jupyter Notebook
-*****************
-
-.. mdinclude:: ../../jupyter_notebook/README.md
- :start-line: 2
diff --git a/docs/source/keras.rst b/docs/source/keras.rst
index eb4f2d7fa7..f1c0743c70 100644
--- a/docs/source/keras.rst
+++ b/docs/source/keras.rst
@@ -1,6 +1,7 @@
-*************
-Keras Support
-*************
+:tocdepth: 1
+****************
+Keras Interface
+****************
FlexFlow provides a drop-in replacement for TensorFlow Keras. Running an existing Keras program on the FlexFlow backend only requires a few lines of changes to the program. The detailed instructions are as follows:
diff --git a/docs/source/mt5.rst b/docs/source/mt5.rst
index c9c3af080a..8a632b90d6 100644
--- a/docs/source/mt5.rst
+++ b/docs/source/mt5.rst
@@ -1,6 +1,6 @@
-****************
-HuggingFace mT5
-****************
+************************
+mT5 Model
+************************
.. mdinclude:: ../../examples/python/pytorch/mt5/README.md
:start-line: 2
diff --git a/docs/source/multinode.rst b/docs/source/multinode.rst
new file mode 100644
index 0000000000..8827200582
--- /dev/null
+++ b/docs/source/multinode.rst
@@ -0,0 +1,8 @@
+:tocdepth: 1
+******************
+Multinode tutorial
+******************
+
+
+.. mdinclude:: ../../MULTI-NODE.md
+ :start-line: 3
diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst
index 91b314ac96..b6bc49b146 100644
--- a/docs/source/onnx.rst
+++ b/docs/source/onnx.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
ONNX Support
*************
diff --git a/docs/source/prompt_template.rst b/docs/source/prompt_template.rst
new file mode 100644
index 0000000000..7f987b0f18
--- /dev/null
+++ b/docs/source/prompt_template.rst
@@ -0,0 +1,55 @@
+:tocdepth: 1
+****************
+Prompt Template
+****************
+
+Prompt templates guide the model's response generation. This use case demonstrates setting up FlexFlow Serve to integrate with Langchain and using prompt templates to handle dynamic user inputs.
+
+Requirements
+============
+
+- FlexFlow Serve setup with appropriate configurations.
+- Langchain integration with templates for prompt management.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+ Initialize and configure FlexFlow Serve.
+
+2. LLM Setup
+ Compile and start the server for text generation.
+
+3. Prompt Template Setup
+   Set up a prompt template to guide the model's responses.
+
+4. Response Generation
+ Use the LLM with the prompt template to generate a response.
+
+5. Shutdown
+ Stop the FlexFlow server after generating the response.
+
+Example
+=======
+
+Complete code examples can be found here:
+
+1. `Prompt Template Example with incremental decoding `__
+
+2. `Prompt Template Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ import flexflow.serve as ff
+ from langchain.prompts import PromptTemplate
+
+        ff_llm = FlexFlowLLM(...)
+        ff_llm.compile_and_start(...)
+
+        template = "Question: {question}\nAnswer:"
+        prompt = PromptTemplate(template=template, input_variables=["question"])
+
+        response = ff_llm.generate(prompt.format(question="Who was the US president in 1997?"))
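+
+        # Stop the FlexFlow server once the response has been generated (step 5 above)
+        ff_llm.stop_server()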
diff --git a/docs/source/python/layers.rst b/docs/source/python/layers.rst
index 91f12094e6..1be91a8b17 100644
--- a/docs/source/python/layers.rst
+++ b/docs/source/python/layers.rst
@@ -3,7 +3,7 @@ Layers API
**********
Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consists of a tensor or a list of tensors and some state variables,
-and the outputs of a layer is a tensor or a list of tensors.
+and the output of a layer is a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example of every layer.
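+
+A minimal sketch of invoking a layer on an input tensor, assuming an ``FFModel`` named ``ffmodel`` and a tensor ``input_tensor`` have already been created (see the linked op examples for complete programs):
+
+.. code-block:: python
+
+    out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU)
+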
.. automodule:: flexflow.core.flexflow_cffi
:noindex:
diff --git a/docs/source/pytorch.rst b/docs/source/pytorch.rst
index a6d4e23311..3dbe337d55 100644
--- a/docs/source/pytorch.rst
+++ b/docs/source/pytorch.rst
@@ -1,6 +1,7 @@
-***************
-PyTorch Support
-***************
+:tocdepth: 1
+******************
+PyTorch Interface
+******************
Users can use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps.
The PyTorch support requires the `PyTorch FX module `_, so make sure your PyTorch is up to date.
diff --git a/docs/source/rag.rst b/docs/source/rag.rst
new file mode 100644
index 0000000000..640b2fe131
--- /dev/null
+++ b/docs/source/rag.rst
@@ -0,0 +1,90 @@
+:tocdepth: 1
+********
+RAG Q&A
+********
+
+Retrieval Augmented Generation (RAG) combines language models with external knowledge. This use case integrates RAG with FlexFlow Serve for Q&A with documents.
+
+Requirements
+============
+
+- FlexFlow Serve setup.
+- Retriever setup for RAG.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+ Initialize and configure FlexFlow Serve.
+
+2. Data Retrieval Setup
+   Set up a retriever to source information relevant to user queries.
+
+3. RAG Integration
+ Integrate the retriever with FlexFlow Serve.
+
+4. Response Generation
+   Use the LLM with RAG to generate responses based on the model's knowledge and the retrieved information.
+
+5. Shutdown
+   Stop the FlexFlow server after generating the response.
+
+Example
+=======
+
+Complete code examples for web-document Q&A using FlexFlow can be found here:
+
+1. `RAG Q&A Example with incremental decoding `__
+
+2. `RAG Q&A Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ # imports
+
+ # compile and start server
+ ff_llm = FlexFlowLLM(...)
+ gen_config = ff.GenerationConfig(...)
+ ff_llm.compile_and_start(...)
+ ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm)
+
+
+ # Load web page content
+ loader = WebBaseLoader("https://example.com/data")
+ data = loader.load()
+
+ # Split text
+ text_splitter = RecursiveCharacterTextSplitter(...)
+ all_splits = text_splitter.split_documents(data)
+
+ # Initialize embeddings
+ embeddings = OpenAIEmbeddings(...)
+
+ # Create VectorStore
+ vectorstore = Chroma.from_documents(all_splits, embeddings)
+
+ # Use VectorStore as a retriever
+ retriever = vectorstore.as_retriever()
+
+ # Apply similarity search
+ question = "Example Question"
+ docs = vectorstore.similarity_search(question)
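+        # Truncate each retrieved document so the combined context stays short;
+        # tune max_chars_per_doc to match the context window of the served model.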
+ max_chars_per_doc = 100
+ docs_text = ''.join([docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))])
+
+ # Using a Prompt Template
+ prompt_rag = PromptTemplate.from_template(
+ "Summarize the main themes in these retrieved docs: {docs_text}"
+ )
+
+ # Build Chain
+ llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag)
+
+ # Run
+ rag_result = llm_chain_rag(docs_text)
+
+ # Stop the server
+ ff_llm.stop_server()
\ No newline at end of file
diff --git a/docs/source/serve_api.rst b/docs/source/serve_api.rst
new file mode 100644
index 0000000000..6a607cbf0c
--- /dev/null
+++ b/docs/source/serve_api.rst
@@ -0,0 +1,7 @@
+**************************
+FlexFlow Serve Python API
+**************************
+
+.. toctree::
+ serve_fastapi
+ serve_gradioapi
\ No newline at end of file
diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst
new file mode 100644
index 0000000000..62a28e5937
--- /dev/null
+++ b/docs/source/serve_fastapi.rst
@@ -0,0 +1,106 @@
+:tocdepth: 1
+***********************
+FlexFlow Serve FastAPI
+***********************
+
+Introduction
+============
+
+The Python API for FlexFlow Serve enables users to initialize, manage and interact with large language models (LLMs) via FastAPI or Gradio.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- FastAPI and Uvicorn for running the API server.
+
+API Configuration
+=================
+
+Users can configure the API using FastAPI to handle requests and manage the model.
+
+1. FastAPI Application Initialization
+ Initialize the FastAPI application to create API endpoints.
+
+2. Request Model Definition
+ Define the model for API requests using Pydantic.
+
+3. Global Variable for LLM Model
+ Declare a global variable to store the LLM model.
+
+Example
+-------
+
+.. code-block:: python
+
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import flexflow.serve as ff
+
+ app = FastAPI()
+
+ class PromptRequest(BaseModel):
+ prompt: str
+
+ llm = None
+
+Endpoint Creation
+=================
+
+Create API endpoints for LLM interactions to handle generation requests.
+
+1. Initialize Model on Startup
+ Use the FastAPI event handler to initialize and compile the LLM model when the API server starts.
+
+2. Generate Response Endpoint
+ Create a POST endpoint to generate responses based on the user's prompt.
+
+Example
+-------
+
+.. code-block:: python
+
+ @app.on_event("startup")
+ async def startup_event():
+ global llm
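+        # Sketch: assumes the LLM object is constructed here first, e.g. with a
+        # hypothetical model name:
+        # llm = ff.LLM("meta-llama/Llama-2-7b-hf")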
+ # Initialize and compile the LLM model
+ llm.compile(
+ generation_config,
+ # ... other params as needed
+ )
+ llm.start_server()
+
+ @app.post("/generate/")
+ async def generate(prompt_request: PromptRequest):
+ # ... exception handling
+ full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
+ # ... split prompt and response text for returning results
+ return {"prompt": prompt_request.prompt, "response": full_output}
+
+Running and Testing
+===================
+
+Instructions for running and testing the FastAPI server.
+
+1. Run the FastAPI Server
+ Use Uvicorn to run the FastAPI server with specified host and port.
+
+2. Testing the API
+ Make requests to the API endpoints and verify the responses.
+
+Example
+-------
+
+.. code-block:: bash
+
+ # Running within the inference/python folder:
+ uvicorn entrypoint.fastapi_incr:app --reload --port 3000
+
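+Once the server is running, the ``/generate/`` endpoint can be tested with any HTTP client. For example, a minimal check with ``curl``, assuming the server is listening on port 3000 as above:
+
+.. code-block:: bash
+
+    curl -X POST http://localhost:3000/generate/ \
+        -H "Content-Type: application/json" \
+        -d '{"prompt": "What is FlexFlow Serve?"}'
+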
+Full API Entrypoint Code
+=========================
+
+Complete code examples for the FastAPI entrypoints can be found here:
+
+1. `FastAPI Example with incremental decoding `__
+
+2. `FastAPI Example with speculative inference `__
diff --git a/docs/source/serve_gradioapi.rst b/docs/source/serve_gradioapi.rst
new file mode 100644
index 0000000000..ed19e05347
--- /dev/null
+++ b/docs/source/serve_gradioapi.rst
@@ -0,0 +1,30 @@
+:tocdepth: 1
+*************************
+FlexFlow Serve Gradio API
+*************************
+
+Introduction
+============
+
+Users can also set up the API endpoints with a Gradio Chatbot Interface.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- A running Gradio chatbot interface.
+
+Example
+========
+
+In a running Gradio chatbot interface, click the "Use via API" button at the bottom left.
+
+ .. image:: /imgs/gradio_interface.png
+ :alt: Gradio Chatbot Interface
+ :align: center
+
+Users can easily access an API endpoint for sending prompts to the model.
+
+ .. image:: /imgs/gradio_api.png
+ :alt: Gradio API
+ :align: center
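+
+The chatbot can also be driven programmatically. Below is a minimal sketch that wraps the model in a Gradio ``ChatInterface``; it assumes an ``llm`` object has already been compiled and started earlier in the script, as in the FastAPI example:
+
+.. code-block:: python
+
+    import gradio as gr
+
+    # Assumes `llm` is a FlexFlow Serve model that was compiled and started
+    # (llm.start_server()) earlier in the script.
+    def generate_response(message, history):
+        # generate() returns a list of results; return the first output text
+        return llm.generate([message])[0].output_text.decode("utf-8")
+
+    gr.ChatInterface(fn=generate_response, title="FlexFlow Serve Chatbot").launch()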
\ No newline at end of file
diff --git a/docs/source/serve_overview.rst b/docs/source/serve_overview.rst
new file mode 100644
index 0000000000..35c992a853
--- /dev/null
+++ b/docs/source/serve_overview.rst
@@ -0,0 +1,7 @@
+:tocdepth: 1
+****************
+Serving Overview
+****************
+
+.. mdinclude:: ../../SERVE.md
+ :start-line: 3
diff --git a/docs/source/serve_usecases.rst b/docs/source/serve_usecases.rst
new file mode 100644
index 0000000000..4aa3fd2807
--- /dev/null
+++ b/docs/source/serve_usecases.rst
@@ -0,0 +1,8 @@
+*******************
+Serving Use Cases
+*******************
+
+.. toctree::
+ chatbot
+ prompt_template
+ rag
\ No newline at end of file
diff --git a/docs/source/train_examples.rst b/docs/source/train_examples.rst
new file mode 100644
index 0000000000..84d58c3465
--- /dev/null
+++ b/docs/source/train_examples.rst
@@ -0,0 +1,6 @@
+*****************
+Training Examples
+*****************
+
+.. toctree::
+ mt5
\ No newline at end of file
diff --git a/docs/source/train_interface.rst b/docs/source/train_interface.rst
new file mode 100644
index 0000000000..ce81fc1f3c
--- /dev/null
+++ b/docs/source/train_interface.rst
@@ -0,0 +1,8 @@
+*******************
+Training Interface
+*******************
+
+.. toctree::
+ keras
+ pytorch
+ onnx
\ No newline at end of file
diff --git a/docs/source/train_overview.rst b/docs/source/train_overview.rst
new file mode 100644
index 0000000000..58898ad35c
--- /dev/null
+++ b/docs/source/train_overview.rst
@@ -0,0 +1,7 @@
+:tocdepth: 1
+*****************
+Training Overview
+*****************
+
+.. mdinclude:: ../../TRAIN.md
+ :start-line: 3
diff --git a/docs/source/train_python_api.rst b/docs/source/train_python_api.rst
new file mode 100644
index 0000000000..40451dedf9
--- /dev/null
+++ b/docs/source/train_python_api.rst
@@ -0,0 +1,11 @@
+*******************
+Python API
+*******************
+This section documents the Python API for FlexFlow Train.
+
+.. toctree::
+ :maxdepth: 3
+
+ python/models
+ python/layers
+ python/dataloader
\ No newline at end of file
diff --git a/docs/source/welcome.rst b/docs/source/welcome.rst
index 8108b1dd67..7f73f15563 100644
--- a/docs/source/welcome.rst
+++ b/docs/source/welcome.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
Overview
*************
diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc
index 128496eab1..3507882329 100644
--- a/examples/cpp/AlexNet/alexnet.cc
+++ b/examples/cpp/AlexNet/alexnet.cc
@@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("AlexNet");
+Legion::Logger log_app("AlexNet");
void parse_input_args(char **argv, int argc, AlexNetConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc
index 7dc49215b3..d7dc167557 100644
--- a/examples/cpp/DLRM/dlrm.cc
+++ b/examples/cpp/DLRM/dlrm.cc
@@ -19,7 +19,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("DLRM");
+Legion::Logger log_app("DLRM");
void parse_input_args(char **argv, int argc, DLRMConfig &apConfig);
diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc
index b2070cc52d..6d0fa7ee53 100644
--- a/examples/cpp/InceptionV3/inception.cc
+++ b/examples/cpp/InceptionV3/inception.cc
@@ -21,7 +21,7 @@
using namespace Legion;
using namespace FlexFlow;
-LegionRuntime::Logger::Category log_app("Inceptionv3");
+Legion::Logger log_app("Inceptionv3");
Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) {
Tensor t1 = input;
diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc
index 455eb743ae..49ce934a6a 100644
--- a/examples/cpp/ResNet/resnet.cc
+++ b/examples/cpp/ResNet/resnet.cc
@@ -24,7 +24,7 @@ using FlexFlow::Optimizer;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("ResNet");
+Legion::Logger log_app("ResNet");
void parse_input_args(char **argv, int argc, ResNetConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc
index d61a63cd03..b04093b0a9 100644
--- a/examples/cpp/Transformer/transformer.cc
+++ b/examples/cpp/Transformer/transformer.cc
@@ -17,7 +17,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("Transformer");
+Legion::Logger log_app("Transformer");
Tensor create_emb(FFModel *model,
Tensor const &input,
diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc
index 2e6c3cec98..a2272f36e5 100644
--- a/examples/cpp/XDL/xdl.cc
+++ b/examples/cpp/XDL/xdl.cc
@@ -18,7 +18,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("XDL");
+Legion::Logger log_app("XDL");
void parse_input_args(char **argv, int argc, XDLConfig &apConfig);
diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc
index 779b8e9c14..e9f4bf876a 100644
--- a/examples/cpp/candle_uno/candle_uno.cc
+++ b/examples/cpp/candle_uno/candle_uno.cc
@@ -21,7 +21,7 @@
using namespace Legion;
using namespace std;
-LegionRuntime::Logger::Category log_app("Candle_Uno");
+Legion::Logger log_app("Candle_Uno");
void parse_input_args(char **argv, int argc, CandleConfig &apConfig);
diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc
index a707310885..a25f94abd9 100644
--- a/examples/cpp/mixture_of_experts/moe.cc
+++ b/examples/cpp/mixture_of_experts/moe.cc
@@ -20,7 +20,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("MoE");
+Legion::Logger log_app("MoE");
void parse_input_args(char **argv, int argc, MoeConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc
index 3c28ca27b8..9b71b37cce 100644
--- a/examples/cpp/resnext50/resnext.cc
+++ b/examples/cpp/resnext50/resnext.cc
@@ -7,7 +7,7 @@ using FlexFlow::Optimizer;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("resnext");
+Legion::Logger log_app("resnext");
Tensor resnext_block(FFModel &ff,
Tensor input,
diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc
index 97b98c3214..ac9d516a59 100644
--- a/examples/cpp/split_test/split_test.cc
+++ b/examples/cpp/split_test/split_test.cc
@@ -3,7 +3,7 @@
using namespace Legion;
using namespace FlexFlow;
-LegionRuntime::Logger::Category log_app("split_test");
+Legion::Logger log_app("split_test");
void FlexFlow::top_level_task(Task const *task,
std::vector const ®ions,
diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc
index 69385d14cb..fef078adbc 100644
--- a/examples/cpp/split_test_2/split_test_2.cc
+++ b/examples/cpp/split_test_2/split_test_2.cc
@@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph;
using FlexFlow::PCG::GraphSearchHelper;
using FlexFlow::PCG::Node;
-LegionRuntime::Logger::Category log_app("split_test_2");
+Legion::Logger log_app("split_test_2");
void top_level_task(Task const *task,
std::vector const ®ions,
diff --git a/examples/python/keras/callback.py b/examples/python/keras/callback.py
index f4ebc03d17..c647822957 100644
--- a/examples/python/keras/callback.py
+++ b/examples/python/keras/callback.py
@@ -20,6 +20,7 @@
from flexflow.keras.datasets import cifar10
from flexflow.keras import backend as K
from accuracy import ModelAccuracy
+import flexflow.core as ff
import numpy as np
@@ -68,4 +69,6 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 cnn callback")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/elementwise_max_min.py b/examples/python/keras/elementwise_max_min.py
index 95291f1273..52a80b431b 100644
--- a/examples/python/keras/elementwise_max_min.py
+++ b/examples/python/keras/elementwise_max_min.py
@@ -1,5 +1,6 @@
from flexflow.keras.layers import Dense, Input, Maximum, Minimum
import flexflow.keras.optimizers
+import flexflow.core as ff
import numpy as np
@@ -54,7 +55,8 @@ def elementwise_min():
epochs = 2
)
-
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
elementwise_max()
elementwise_min()
diff --git a/examples/python/keras/elementwise_mul_broadcast.py b/examples/python/keras/elementwise_mul_broadcast.py
index d68476a6cb..1405871a7a 100644
--- a/examples/python/keras/elementwise_mul_broadcast.py
+++ b/examples/python/keras/elementwise_mul_broadcast.py
@@ -1,6 +1,6 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def broadcast1():
@@ -92,8 +92,9 @@ def broadcast_both():
epochs = 2
)
-
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
broadcast1()
broadcast2()
broadcast_both()
diff --git a/examples/python/keras/func_cifar10_alexnet.py b/examples/python/keras/func_cifar10_alexnet.py
index c0ade0b722..a4f8dc61ac 100644
--- a/examples/python/keras/func_cifar10_alexnet.py
+++ b/examples/python/keras/func_cifar10_alexnet.py
@@ -77,5 +77,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 alexnet")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn.py b/examples/python/keras/func_cifar10_cnn.py
index 423541386f..ce0358da53 100644
--- a/examples/python/keras/func_cifar10_cnn.py
+++ b/examples/python/keras/func_cifar10_cnn.py
@@ -61,7 +61,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat.py b/examples/python/keras/func_cifar10_cnn_concat.py
index 72dfdeffaf..4fe0f5ce18 100644
--- a/examples/python/keras/func_cifar10_cnn_concat.py
+++ b/examples/python/keras/func_cifar10_cnn_concat.py
@@ -75,5 +75,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat_model.py b/examples/python/keras/func_cifar10_cnn_concat_model.py
index 39885bac8c..c8838de1eb 100644
--- a/examples/python/keras/func_cifar10_cnn_concat_model.py
+++ b/examples/python/keras/func_cifar10_cnn_concat_model.py
@@ -75,7 +75,10 @@ def top_level_task():
model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
index cda95beb49..3e4f939283 100644
--- a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
+++ b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
@@ -68,7 +68,10 @@ def top_level_task():
model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat sequential model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_nested.py b/examples/python/keras/func_cifar10_cnn_nested.py
index def8a6bcf4..7391ba5a2b 100644
--- a/examples/python/keras/func_cifar10_cnn_nested.py
+++ b/examples/python/keras/func_cifar10_cnn_nested.py
@@ -67,7 +67,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn nested")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_net2net.py b/examples/python/keras/func_cifar10_cnn_net2net.py
index 5434e28aca..695a1157dd 100644
--- a/examples/python/keras/func_cifar10_cnn_net2net.py
+++ b/examples/python/keras/func_cifar10_cnn_net2net.py
@@ -120,5 +120,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifarf10 cnn teach student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_cnn.py b/examples/python/keras/func_mnist_cnn.py
index a81ddd0f94..8f2041dfe2 100644
--- a/examples/python/keras/func_mnist_cnn.py
+++ b/examples/python/keras/func_mnist_cnn.py
@@ -70,7 +70,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Functional API, mnist cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_cnn_concat.py b/examples/python/keras/func_mnist_cnn_concat.py
index 54c1f32d36..64bb2cdbb0 100644
--- a/examples/python/keras/func_mnist_cnn_concat.py
+++ b/examples/python/keras/func_mnist_cnn_concat.py
@@ -61,7 +61,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Functional API, mnist cnn concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp.py b/examples/python/keras/func_mnist_mlp.py
index 5521f193c1..ddf2022366 100644
--- a/examples/python/keras/func_mnist_mlp.py
+++ b/examples/python/keras/func_mnist_mlp.py
@@ -54,7 +54,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_concat.py b/examples/python/keras/func_mnist_mlp_concat.py
index 29b982cea8..6b282f65e6 100644
--- a/examples/python/keras/func_mnist_mlp_concat.py
+++ b/examples/python/keras/func_mnist_mlp_concat.py
@@ -76,7 +76,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_concat2.py b/examples/python/keras/func_mnist_mlp_concat2.py
index 5a35bd9f8b..b309a00187 100644
--- a/examples/python/keras/func_mnist_mlp_concat2.py
+++ b/examples/python/keras/func_mnist_mlp_concat2.py
@@ -87,7 +87,10 @@ def top_level_task():
model.fit([x_train, x_train, x_train], y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp concat with input")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_net2net.py b/examples/python/keras/func_mnist_mlp_net2net.py
index ed8589e22e..0b44029938 100644
--- a/examples/python/keras/func_mnist_mlp_net2net.py
+++ b/examples/python/keras/func_mnist_mlp_net2net.py
@@ -88,7 +88,10 @@ def top_level_task():
student_model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp teach student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
- gc.collect()
\ No newline at end of file
+ gc.collect()
diff --git a/examples/python/keras/gather.py b/examples/python/keras/gather.py
index 15ccd61579..f14d737d17 100644
--- a/examples/python/keras/gather.py
+++ b/examples/python/keras/gather.py
@@ -1,7 +1,7 @@
from flexflow.keras.layers import Dense, Input, Reshape
from flexflow.keras.backend.internal import gather
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
@@ -42,4 +42,6 @@ def gather_example():
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
gather_example()
diff --git a/examples/python/keras/identity_loss.py b/examples/python/keras/identity_loss.py
index d0396c6d46..8e26fc246b 100644
--- a/examples/python/keras/identity_loss.py
+++ b/examples/python/keras/identity_loss.py
@@ -15,7 +15,7 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_identity_loss():
@@ -36,4 +36,6 @@ def test_identity_loss():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_identity_loss()
diff --git a/examples/python/keras/reduce_sum.py b/examples/python/keras/reduce_sum.py
index 3857738d4b..33030e2cec 100644
--- a/examples/python/keras/reduce_sum.py
+++ b/examples/python/keras/reduce_sum.py
@@ -15,7 +15,7 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_reduce_sum1():
@@ -74,6 +74,8 @@ def test_reduce_sum3():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_reduce_sum1()
test_reduce_sum2()
test_reduce_sum3()
diff --git a/examples/python/keras/regularizer.py b/examples/python/keras/regularizer.py
index 3b1e30d04d..3a24129db2 100644
--- a/examples/python/keras/regularizer.py
+++ b/examples/python/keras/regularizer.py
@@ -2,7 +2,7 @@
from flexflow.keras.layers import Dense, Input, Reshape
from flexflow.keras.backend.internal import gather
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
@@ -26,4 +26,6 @@ def regularizer_example():
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
regularizer_example()
diff --git a/examples/python/keras/reshape.py b/examples/python/keras/reshape.py
index 1acce1b2b6..ae756a8f70 100644
--- a/examples/python/keras/reshape.py
+++ b/examples/python/keras/reshape.py
@@ -55,7 +55,10 @@ def top_level_task():
print(model.summary())
model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/rsqrt.py b/examples/python/keras/rsqrt.py
index be55c8a1fd..e33873ecd5 100644
--- a/examples/python/keras/rsqrt.py
+++ b/examples/python/keras/rsqrt.py
@@ -16,7 +16,7 @@
from flexflow.keras.layers import Dense, Input
from flexflow.keras.backend.internal import rsqrt
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_rsqrt():
@@ -40,4 +40,6 @@ def test_rsqrt():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_rsqrt()
diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py
index 80f4390d4c..66ea8530e0 100644
--- a/examples/python/keras/seq_cifar10_cnn.py
+++ b/examples/python/keras/seq_cifar10_cnn.py
@@ -54,6 +54,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=80, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
- print("Sequantial model, cifar10 cnn")
+ print("Sequential model, cifar10 cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn.py b/examples/python/keras/seq_mnist_cnn.py
index eaf0fdfc16..09ad4ea4cf 100644
--- a/examples/python/keras/seq_mnist_cnn.py
+++ b/examples/python/keras/seq_mnist_cnn.py
@@ -55,6 +55,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Sequential model, mnist cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn_nested.py b/examples/python/keras/seq_mnist_cnn_nested.py
index 2c92349cd6..628129ddb9 100644
--- a/examples/python/keras/seq_mnist_cnn_nested.py
+++ b/examples/python/keras/seq_mnist_cnn_nested.py
@@ -65,6 +65,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Sequential model, mnist cnn nested model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn_net2net.py b/examples/python/keras/seq_mnist_cnn_net2net.py
index 4b9c9c16ba..e2a04ba686 100644
--- a/examples/python/keras/seq_mnist_cnn_net2net.py
+++ b/examples/python/keras/seq_mnist_cnn_net2net.py
@@ -98,6 +98,9 @@ def top_level_task():
create_student_model_cnn(teacher_model, num_classes, x_train, y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp teacher student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_mlp.py b/examples/python/keras/seq_mnist_mlp.py
index 21c7435eb7..46b774a2e1 100644
--- a/examples/python/keras/seq_mnist_mlp.py
+++ b/examples/python/keras/seq_mnist_mlp.py
@@ -55,6 +55,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=20, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
model.evaluate(x=x_train, y=y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_mlp_net2net.py b/examples/python/keras/seq_mnist_mlp_net2net.py
index 628f76db3a..c7a7d7a6f8 100644
--- a/examples/python/keras/seq_mnist_mlp_net2net.py
+++ b/examples/python/keras/seq_mnist_mlp_net2net.py
@@ -91,6 +91,9 @@ def top_level_task():
create_student_model_mlp(teacher_model, num_classes, x_train, y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp teacher student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_reuters_mlp.py b/examples/python/keras/seq_reuters_mlp.py
index 5412ad0599..ed748f67d8 100644
--- a/examples/python/keras/seq_reuters_mlp.py
+++ b/examples/python/keras/seq_reuters_mlp.py
@@ -19,6 +19,7 @@
from flexflow.keras.datasets import reuters
from flexflow.keras.preprocessing.text import Tokenizer
from flexflow.keras.callbacks import Callback, VerifyMetrics
+import flexflow.core as ff
import numpy as np
from accuracy import ModelAccuracy
@@ -61,6 +62,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=epochs, callbacks=[VerifyMetrics(ModelAccuracy.REUTERS_MLP)])
+
if __name__ == "__main__":
print("Sequential model, reuters mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/unary.py b/examples/python/keras/unary.py
index 622e15dc2d..63c83b9af2 100644
--- a/examples/python/keras/unary.py
+++ b/examples/python/keras/unary.py
@@ -62,4 +62,6 @@ def top_level_task():
if __name__ == "__main__":
print("alexnet keras")
- top_level_task()
\ No newline at end of file
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
+ top_level_task()
diff --git a/examples/python/native/alexnet.py b/examples/python/native/alexnet.py
index 61397cefc1..6d6e58a7f2 100644
--- a/examples/python/native/alexnet.py
+++ b/examples/python/native/alexnet.py
@@ -3,7 +3,7 @@
from accuracy import ModelAccuracy
from PIL import Image
-import argparse
+import argparse, json
import numpy as np
@@ -133,7 +133,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing cifar10 alexnet training accuracy")
test_accuracy()
diff --git a/examples/python/native/cifar10_cnn.py b/examples/python/native/cifar10_cnn.py
index 44bdce4519..11bc936617 100644
--- a/examples/python/native/cifar10_cnn.py
+++ b/examples/python/native/cifar10_cnn.py
@@ -2,7 +2,7 @@
from flexflow.keras.datasets import cifar10
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -90,7 +90,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing cifar10 cnn training accuracy")
test_accuracy()
diff --git a/examples/python/native/cifar10_cnn_attach.py b/examples/python/native/cifar10_cnn_attach.py
index ba4288c8cd..e200cc03cf 100644
--- a/examples/python/native/cifar10_cnn_attach.py
+++ b/examples/python/native/cifar10_cnn_attach.py
@@ -144,4 +144,6 @@ def top_level_task():
if __name__ == "__main__":
print("cifar10 cnn attach")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/cifar10_cnn_concat.py b/examples/python/native/cifar10_cnn_concat.py
index b177295ad6..7234116b3c 100644
--- a/examples/python/native/cifar10_cnn_concat.py
+++ b/examples/python/native/cifar10_cnn_concat.py
@@ -70,6 +70,10 @@ def top_level_task():
if accuracy < ModelAccuracy.CIFAR10_CNN.value:
assert 0, 'Check Accuracy'
+
+
if __name__ == "__main__":
print("cifar10 cnn concat")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/mnist_cnn.py b/examples/python/native/mnist_cnn.py
index 6eabbe57db..f6787a4827 100644
--- a/examples/python/native/mnist_cnn.py
+++ b/examples/python/native/mnist_cnn.py
@@ -18,7 +18,7 @@
from flexflow.keras.datasets import mnist
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -89,7 +89,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing mnist cnn training accuracy")
test_accuracy()
diff --git a/examples/python/native/mnist_mlp.py b/examples/python/native/mnist_mlp.py
index aefe7cfd57..8763eba40c 100644
--- a/examples/python/native/mnist_mlp.py
+++ b/examples/python/native/mnist_mlp.py
@@ -3,7 +3,7 @@
from flexflow.keras.datasets import mnist
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -75,7 +75,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing mnist mlp training accuracy")
test_accuracy()
diff --git a/examples/python/native/mnist_mlp_attach.py b/examples/python/native/mnist_mlp_attach.py
index 6e7c8f8405..1294432ec5 100644
--- a/examples/python/native/mnist_mlp_attach.py
+++ b/examples/python/native/mnist_mlp_attach.py
@@ -134,4 +134,6 @@ def top_level_task():
if __name__ == "__main__":
print("mnist mlp attach")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/ops/add.py b/examples/python/native/ops/add.py
new file mode 100644
index 0000000000..50b9d16fd0
--- /dev/null
+++ b/examples/python/native/ops/add.py
@@ -0,0 +1,45 @@
+# This test of the 'add' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.add(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_add(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/add_bias_residual_layer_norm.py b/examples/python/native/ops/add_bias_residual_layer_norm.py
new file mode 100644
index 0000000000..6e8dffbc9e
--- /dev/null
+++ b/examples/python/native/ops/add_bias_residual_layer_norm.py
@@ -0,0 +1,78 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT)
+
+ output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm(
+ input_tensor,
+ residual_tensor,
+ axes=axes,
+ elementwise_affine=elementwise_affine,
+ eps=eps,
+ use_bias=use_bias,
+ name="add_bias_residual_layer_norm_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_residual.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_residual.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ output_tensor.inline_map(ffmodel, ffconfig)
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = output_tensor.get_array(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result, layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ output_result, layer_norm_result = test_add_bias_residual_layer_norm(
+ ffconfig,
+ input_data,
+ residual_data,
+ axes=axes_to_normalize,
+ elementwise_affine=True,
+ eps=1e-5,
+ use_bias=True
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nResidual Array:")
+ print(residual_data)
+ print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:")
+ print(output_result)
+ print("\nLayer Norm Result:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/arg_top_k.py b/examples/python/native/ops/arg_top_k.py
new file mode 100644
index 0000000000..79edc5dfad
--- /dev/null
+++ b/examples/python/native/ops/arg_top_k.py
@@ -0,0 +1,61 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ arg_top_k_output = ffmodel.arg_top_k(
+ input_tensor,
+ k,
+ sorted,
+ speculative_decoding,
+ name="arg_top_k_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_MEAN_SQUARED_ERROR,
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ arg_top_k_output.inline_map(ffmodel, ffconfig)
+ output_result = arg_top_k_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ k_value = 5
+ sorted_value = True
+ speculative_decoding_value = False # Example value for speculative_decoding
+
+ output_result = test_arg_top_k(
+ ffconfig,
+ input_data,
+ k=k_value,
+ sorted=sorted_value,
+ speculative_decoding=speculative_decoding_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying arg_top_k:")
+ print(output_result)
diff --git a/examples/python/native/ops/argmax.py b/examples/python/native/ops/argmax.py
new file mode 100644
index 0000000000..dda0e6b0bc
--- /dev/null
+++ b/examples/python/native/ops/argmax.py
@@ -0,0 +1,55 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_argmax(ffconfig, input_arr: np.ndarray, beam_search: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ argmax_output = ffmodel.argmax(
+ input_tensor,
+ beam_search,
+ name="argmax_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ argmax_output.inline_map(ffmodel, ffconfig)
+ output_result = argmax_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ beam_search_value = True # Set to True or False based on your requirement
+
+ output_result = test_argmax(
+ ffconfig,
+ input_data,
+ beam_search=beam_search_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying argmax:")
+ print(output_result)
diff --git a/examples/python/native/ops/batch_matmul.py b/examples/python/native/ops/batch_matmul.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/batch_norm.py b/examples/python/native/ops/batch_norm.py
new file mode 100644
index 0000000000..b243e79d37
--- /dev/null
+++ b/examples/python/native/ops/batch_norm.py
@@ -0,0 +1,36 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def ff(ffconfig, input_arr: np.ndarray):
+ ffmodel = FFModel(ffconfig)
+ # TODO: convert input to ff tensor
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.batch_norm(
+ input_tensor
+ )
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = ff(ffconfig, input)
diff --git a/examples/python/native/ops/beam_top_k.py b/examples/python/native/ops/beam_top_k.py
new file mode 100644
index 0000000000..cb2fdfb3d2
--- /dev/null
+++ b/examples/python/native/ops/beam_top_k.py
@@ -0,0 +1,58 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_beam_top_k(ffconfig, input_arr: np.ndarray, max_beam_size: int, sorted: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ beam_top_k_output = ffmodel.beam_top_k(
+ input_tensor,
+ max_beam_size,
+ sorted,
+ name="beam_top_k_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ beam_top_k_output.inline_map(ffmodel, ffconfig)
+ output_result = beam_top_k_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ max_beam_size_value = 3
+ sorted_value = True
+
+ output_result = test_beam_top_k(
+ ffconfig,
+ input_data,
+ max_beam_size=max_beam_size_value,
+ sorted=sorted_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying beam_top_k:")
+ print(output_result)
diff --git a/examples/python/native/ops/concat.py b/examples/python/native/ops/concat.py
new file mode 100644
index 0000000000..0088d7b848
--- /dev/null
+++ b/examples/python/native/ops/concat.py
@@ -0,0 +1,43 @@
+# This test of the 'concatenate' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_concatenate(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.concat([input_tensor1, input_tensor2], axis=1)
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = test_concatenate(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/conv2d.py b/examples/python/native/ops/conv2d.py
new file mode 100644
index 0000000000..02b3646aaa
--- /dev/null
+++ b/examples/python/native/ops/conv2d.py
@@ -0,0 +1,45 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def ff(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.conv2d(
+ input_tensor,
+ 32,
+ 3,
+ 3,
+ 1,
+ 1,
+ 1,
+ 1,
+ use_bias=False
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = ff(ffconfig, input)
diff --git a/examples/python/native/ops/cos.py b/examples/python/native/ops/cos.py
new file mode 100644
index 0000000000..26f6307685
--- /dev/null
+++ b/examples/python/native/ops/cos.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_cos(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ cos_output = ffmodel.cos(input_tensor, name="cos_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ cos_output.inline_map(ffmodel, ffconfig)
+ cos_result = cos_output.get_array(ffmodel, ffconfig)
+
+ return cos_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ cos_result = test_cos(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying cos function:")
+ print(cos_result)
diff --git a/examples/python/native/ops/dense.py b/examples/python/native/ops/dense.py
new file mode 100644
index 0000000000..ec0a3dc65b
--- /dev/null
+++ b/examples/python/native/ops/dense.py
@@ -0,0 +1,38 @@
+# This test of the 'dense' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_dense(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ _ = test_dense(ffconfig, input)
diff --git a/examples/python/native/ops/divide.py b/examples/python/native/ops/divide.py
new file mode 100644
index 0000000000..419bf714ab
--- /dev/null
+++ b/examples/python/native/ops/divide.py
@@ -0,0 +1,48 @@
+# This test of the 'divide' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_divide(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.divide(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ # Avoid division by zero in input2
+ input2 = np.where(input2 == 0, 1e-6, input2)
+
+ _ = test_divide(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/dropout.py b/examples/python/native/ops/dropout.py
new file mode 100644
index 0000000000..3aa44a5a5b
--- /dev/null
+++ b/examples/python/native/ops/dropout.py
@@ -0,0 +1,49 @@
+# This test of the 'Dropout' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_dropout(ffconfig, input_arr: np.ndarray, dropout_rate: float = 0.5) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Dropout layer
+ out = ffmodel.dropout(input_tensor, dropout_rate, 0)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ # You can adjust the dropout rate as needed
+ dropout_rate_param = 0.5
+
+ result = test_dropout(ffconfig, input_data, dropout_rate_param)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Dropout layer:")
+ print(result)
diff --git a/examples/python/native/ops/elu.py b/examples/python/native/ops/elu.py
new file mode 100644
index 0000000000..7a6ef1f621
--- /dev/null
+++ b/examples/python/native/ops/elu.py
@@ -0,0 +1,47 @@
+# The basis for this test of the 'ELU' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_elu(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply ELU activation
+ out = ffmodel.elu(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_elu(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after ELU activation:")
+ print(result)
diff --git a/examples/python/native/ops/embedding.py b/examples/python/native/ops/embedding.py
new file mode 100644
index 0000000000..34bced3798
--- /dev/null
+++ b/examples/python/native/ops/embedding.py
@@ -0,0 +1,39 @@
+# The basis for this test of the 'embedding' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_embedding(ffconfig, input_arr: np.ndarray, vocab_size: int, embedding_dim: int) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_INT32)
+
+ out = ffmodel.embedding(input_tensor, vocab_size, embedding_dim, AggrMode.AGGR_MODE_SUM)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ vocab_size = 1000
+ embedding_dim = 50
+    input_data = np.random.randint(low=0, high=vocab_size, size=(ffconfig.batch_size, 10), dtype=np.int32)
+    _ = test_embedding(ffconfig, input_data, vocab_size, embedding_dim)
diff --git a/examples/python/native/ops/exp.py b/examples/python/native/ops/exp.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/flat.py b/examples/python/native/ops/flat.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/gather.py b/examples/python/native/ops/gather.py
new file mode 100644
index 0000000000..e13b6e4c75
--- /dev/null
+++ b/examples/python/native/ops/gather.py
@@ -0,0 +1,60 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_gather(ffconfig, input_arr: np.ndarray, index_arr: np.ndarray, dim: int, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ index_tensor = ffmodel.create_tensor(index_arr.shape, DataType.DT_INT32)
+
+ gather_output = ffmodel.gather(
+ input_tensor,
+ index_tensor,
+ dim,
+ name="gather_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_index = ffmodel.create_data_loader(index_tensor, index_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_index.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_index.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ gather_output.inline_map(ffmodel, ffconfig)
+ output_result = gather_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ index_data = np.random.randint(0, 5, size=(ffconfig.batch_size,)).astype(np.int32)
+ dim_to_gather = 2 # Example dimension to gather along
+
+ output_result = test_gather(ffconfig, input_data, index_data, dim=dim_to_gather)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nIndex Array:")
+ print(index_data)
+ print(f"\nOutput Array after applying gather along dimension {dim_to_gather}:")
+ print(output_result)
diff --git a/examples/python/native/ops/gelu.py b/examples/python/native/ops/gelu.py
new file mode 100644
index 0000000000..84fabd36e1
--- /dev/null
+++ b/examples/python/native/ops/gelu.py
@@ -0,0 +1,51 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_gelu(ffconfig, input_arr: np.ndarray, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ gelu_output = ffmodel.gelu(
+ input_tensor,
+ inplace=inplace,
+ name="gelu_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ gelu_output.inline_map(ffmodel, ffconfig)
+ output_result = gelu_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_gelu(ffconfig, input_data, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying gelu activation function (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/identity.py b/examples/python/native/ops/identity.py
new file mode 100644
index 0000000000..fbf63e717c
--- /dev/null
+++ b/examples/python/native/ops/identity.py
@@ -0,0 +1,49 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_identity(ffconfig, input_arr: np.ndarray, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ identity_output = ffmodel.identity(
+ input_tensor,
+ name="identity_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ identity_output.inline_map(ffmodel, ffconfig)
+ output_result = identity_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ output_result = test_identity(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying identity function:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py
new file mode 100644
index 0000000000..dce7bd565d
--- /dev/null
+++ b/examples/python/native/ops/inc_multihead_self_attention.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multihead_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multihead_self_attention_output = ffmodel.inc_multihead_self_attention(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multihead_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multihead_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_inc_multihead_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multihead_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py
new file mode 100644
index 0000000000..f6dc8e3933
--- /dev/null
+++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multihead_self_attention_verify(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multihead_self_attention_verify_output = ffmodel.inc_multihead_self_attention_verify(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multihead_self_attention_verify_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multihead_self_attention_verify_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multihead_self_attention_verify_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_inc_multihead_self_attention_verify(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multihead_self_attention_verify:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py b/examples/python/native/ops/inc_multiquery_self_attention.py
new file mode 100644
index 0000000000..33390ab1f6
--- /dev/null
+++ b/examples/python/native/ops/inc_multiquery_self_attention.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multiquery_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multiquery_self_attention_output = ffmodel.inc_multiquery_self_attention(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multiquery_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_inc_multiquery_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multiquery_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py
new file mode 100644
index 0000000000..69a76f68bf
--- /dev/null
+++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multiquery_self_attention_verify(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multiquery_self_attention_verify_output = ffmodel.inc_multiquery_self_attention_verify(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multiquery_self_attention_verify_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multiquery_self_attention_verify_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multiquery_self_attention_verify_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_inc_multiquery_self_attention_verify(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multiquery_self_attention_verify:")
+ print(output_result)
diff --git a/examples/python/native/ops/layer_norm.py b/examples/python/native/ops/layer_norm.py
new file mode 100644
index 0000000000..b3cca93d6e
--- /dev/null
+++ b/examples/python/native/ops/layer_norm.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_layer_norm(ffconfig, input_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ layer_norm_output = ffmodel.layer_norm(input_tensor, axes=axes, elementwise_affine=elementwise_affine, eps=eps, use_bias=use_bias, name="layer_norm_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ layer_norm_result = test_layer_norm(ffconfig, input_data, axes=axes_to_normalize, elementwise_affine=True, eps=1e-5, use_bias=True)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying layer_norm function along axes {axes_to_normalize}:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/max.py b/examples/python/native/ops/max.py
new file mode 100644
index 0000000000..bf9c629406
--- /dev/null
+++ b/examples/python/native/ops/max.py
@@ -0,0 +1,54 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_max(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ max_output = ffmodel.max(input_tensor1, input_tensor2, name="max_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ max_output.inline_map(ffmodel, ffconfig)
+ max_result = max_output.get_array(ffmodel, ffconfig)
+
+ return max_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ max_result = test_max(ffconfig, input_data1, input_data2)
+
+ print("Input Array 1:")
+ print(input_data1)
+ print("\nInput Array 2:")
+ print(input_data2)
+ print("\nOutput Array after applying max function:")
+ print(max_result)
diff --git a/examples/python/native/ops/mean.py b/examples/python/native/ops/mean.py
new file mode 100644
index 0000000000..df8c3f642e
--- /dev/null
+++ b/examples/python/native/ops/mean.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_mean(ffconfig, input_arr: np.ndarray, dims: List[int], keepdims: bool = False) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ mean_output = ffmodel.mean(input_tensor, dims=dims, keepdims=keepdims, name="mean_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ mean_output.inline_map(ffmodel, ffconfig)
+ mean_result = mean_output.get_array(ffmodel, ffconfig)
+
+ return mean_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ dims_to_mean = [1, 2] # Example dimensions to take the mean over
+
+ mean_result = test_mean(ffconfig, input_data, dims=dims_to_mean, keepdims=False)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying mean function along dimensions {dims_to_mean}:")
+ print(mean_result)
diff --git a/examples/python/native/ops/min.py b/examples/python/native/ops/min.py
new file mode 100644
index 0000000000..df81f4f2d2
--- /dev/null
+++ b/examples/python/native/ops/min.py
@@ -0,0 +1,54 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_min(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ min_output = ffmodel.min(input_tensor1, input_tensor2, name="min_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ min_output.inline_map(ffmodel, ffconfig)
+ min_result = min_output.get_array(ffmodel, ffconfig)
+
+ return min_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ min_result = test_min(ffconfig, input_data1, input_data2)
+
+ print("Input Array 1:")
+ print(input_data1)
+ print("\nInput Array 2:")
+ print(input_data2)
+ print("\nOutput Array after applying min function:")
+ print(min_result)
diff --git a/examples/python/native/ops/multihead_attention.py b/examples/python/native/ops/multihead_attention.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/multiply.py b/examples/python/native/ops/multiply.py
new file mode 100644
index 0000000000..fb4f489150
--- /dev/null
+++ b/examples/python/native/ops/multiply.py
@@ -0,0 +1,45 @@
+# The basis for this test of the 'multiply' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_multiply(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.multiply(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_multiply(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/pool2d.py b/examples/python/native/ops/pool2d.py
new file mode 100644
index 0000000000..b4dc8b219e
--- /dev/null
+++ b/examples/python/native/ops/pool2d.py
@@ -0,0 +1,36 @@
+# AI-generated from the conv2d.py example.
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_pool2d(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.pool2d(input_tensor, 3, 3, 1, 1, 0, 0, PoolType.POOL_MAX)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+    input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+    _ = test_pool2d(ffconfig, input_data)
\ No newline at end of file
diff --git a/examples/python/native/ops/pow.py b/examples/python/native/ops/pow.py
new file mode 100644
index 0000000000..cf5bbebd80
--- /dev/null
+++ b/examples/python/native/ops/pow.py
@@ -0,0 +1,46 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_pow(ffconfig, input_arr: np.ndarray, exponent: float) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ pow_output = ffmodel.pow(input_tensor, exponent, name="pow_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ pow_output.inline_map(ffmodel, ffconfig)
+ pow_result = pow_output.get_array(ffmodel, ffconfig)
+
+ return pow_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ exponent_value = 2.0 # Example exponent value
+
+ pow_result = test_pow(ffconfig, input_data, exponent=exponent_value)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying pow function with exponent {exponent_value}:")
+ print(pow_result)
diff --git a/examples/python/native/ops/reduce_sum.py b/examples/python/native/ops/reduce_sum.py
new file mode 100644
index 0000000000..7e7b41b799
--- /dev/null
+++ b/examples/python/native/ops/reduce_sum.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_reduce_sum(ffconfig, input_arr: np.ndarray, axes: List[int], keepdims: bool = False) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ reduce_sum_output = ffmodel.reduce_sum(input_tensor, axes=axes, keepdims=keepdims, name="reduce_sum_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ reduce_sum_output.inline_map(ffmodel, ffconfig)
+ reduce_sum_result = reduce_sum_output.get_array(ffmodel, ffconfig)
+
+ return reduce_sum_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ axes_to_reduce = [1, 2] # Example axes to reduce
+
+ reduce_sum_result = test_reduce_sum(ffconfig, input_data, axes=axes_to_reduce, keepdims=False)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying reduce_sum along axes {axes_to_reduce}:")
+ print(reduce_sum_result)
diff --git a/examples/python/native/ops/relu.py b/examples/python/native/ops/relu.py
new file mode 100644
index 0000000000..d855b27164
--- /dev/null
+++ b/examples/python/native/ops/relu.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'ReLU' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_relu(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply ReLU activation
+ out = ffmodel.relu(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_relu(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after ReLU activation:")
+ print(result)
diff --git a/examples/python/native/ops/reshape.py b/examples/python/native/ops/reshape.py
new file mode 100644
index 0000000000..348d6bd935
--- /dev/null
+++ b/examples/python/native/ops/reshape.py
@@ -0,0 +1,41 @@
+# The basis for this test of the 'reshape' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_reshape(ffconfig, input_arr: np.ndarray, target_shape: List[int]) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.reshape(input_tensor, target_shape)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+    input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ target_shape = [ffconfig.batch_size, 500]
+
+    _ = test_reshape(ffconfig, input_data, target_shape)
diff --git a/examples/python/native/ops/residual_layer_norm.py b/examples/python/native/ops/residual_layer_norm.py
new file mode 100644
index 0000000000..e12f2e53d9
--- /dev/null
+++ b/examples/python/native/ops/residual_layer_norm.py
@@ -0,0 +1,93 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual1_arr: np.ndarray, residual2_arr: np.ndarray, use_two_residuals: bool, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ residual1_tensor = ffmodel.create_tensor(residual1_arr.shape, DataType.DT_FLOAT)
+ residual2_tensor = ffmodel.create_tensor(residual2_arr.shape, DataType.DT_FLOAT)
+
+ output_tensor, layer_norm_output = ffmodel.residual_layer_norm(
+ input_tensor,
+ residual1_tensor,
+ residual2_tensor if use_two_residuals else None,
+ use_two_residuals,
+ axes=axes,
+ elementwise_affine=elementwise_affine,
+ eps=eps,
+ use_bias=use_bias,
+ name="residual_layer_norm_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_residual1 = ffmodel.create_data_loader(residual1_tensor, residual1_arr)
+ dataloader_residual2 = ffmodel.create_data_loader(residual2_tensor, residual2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_residual1.reset()
+ if use_two_residuals:
+ dataloader_residual2.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_residual1.next_batch(ffmodel)
+ if use_two_residuals:
+ dataloader_residual2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ output_tensor.inline_map(ffmodel, ffconfig)
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = output_tensor.get_array(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result, layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ use_two_residuals_flag = True # Example flag
+
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ output_result, layer_norm_result = test_residual_layer_norm(
+ ffconfig,
+ input_data,
+ residual1_data,
+ residual2_data,
+ use_two_residuals_flag,
+ axes=axes_to_normalize,
+ elementwise_affine=True,
+ eps=1e-5,
+ use_bias=True
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nResidual1 Array:")
+ print(residual1_data)
+ if use_two_residuals_flag:
+ print("\nResidual2 Array:")
+ print(residual2_data)
+ print(f"\nOutput Array after applying residual_layer_norm along axes {axes_to_normalize} with use_two_residuals={use_two_residuals_flag}:")
+ print(output_result)
+ print("\nLayer Norm Result:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/residual_rms_norm.py b/examples/python/native/ops/residual_rms_norm.py
new file mode 100644
index 0000000000..9027dffada
--- /dev/null
+++ b/examples/python/native/ops/residual_rms_norm.py
@@ -0,0 +1,80 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_residual_rms_norm(
+ ffconfig,
+ input1_arr: np.ndarray,
+ input2_arr: np.ndarray,
+ eps: float,
+ dim: int,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT)
+ input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT)
+
+ residual_rms_norm_output1, residual_rms_norm_output2 = ffmodel.residual_rms_norm(
+ input1_tensor,
+ input2_tensor,
+ eps,
+ dim,
+ name="residual_rms_norm_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr)
+ dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ residual_rms_norm_output1.inline_map(ffmodel, ffconfig)
+ output_result1 = residual_rms_norm_output1.get_array(ffmodel, ffconfig)
+
+ residual_rms_norm_output2.inline_map(ffmodel, ffconfig)
+ output_result2 = residual_rms_norm_output2.get_array(ffmodel, ffconfig)
+
+ return output_result1, output_result2
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ input2_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ eps_value = 1e-6
+ dim_value = 1 # Example value for dim
+
+ output_result1, output_result2 = test_residual_rms_norm(
+ ffconfig,
+ input1_data,
+ input2_data,
+ eps=eps_value,
+ dim=dim_value,
+ )
+
+ print("Input Array 1:")
+ print(input1_data)
+ print("\nInput Array 2:")
+ print(input2_data)
+ print("\nOutput Array 1 after applying residual_rms_norm:")
+ print(output_result1)
+ print("\nOutput Array 2 after applying residual_rms_norm:")
+ print(output_result2)
diff --git a/examples/python/native/ops/reverse.py b/examples/python/native/ops/reverse.py
new file mode 100644
index 0000000000..25394d4b9a
--- /dev/null
+++ b/examples/python/native/ops/reverse.py
@@ -0,0 +1,37 @@
+# The basis for this test of the 'reverse' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_reverse(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.reverse(input_tensor, axis=2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+    input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+    _ = test_reverse(ffconfig, input_data)
diff --git a/examples/python/native/ops/rms_norm.py b/examples/python/native/ops/rms_norm.py
new file mode 100644
index 0000000000..3983d7f891
--- /dev/null
+++ b/examples/python/native/ops/rms_norm.py
@@ -0,0 +1,64 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_rms_norm(
+ ffconfig,
+ input_arr: np.ndarray,
+ eps: float,
+ dim: int,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ rms_norm_output = ffmodel.rms_norm(
+ input_tensor,
+ eps,
+ dim,
+ name="rms_norm_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ rms_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = rms_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ eps_value = 1e-6
+ dim_value = 1 # Example value for dim
+
+ output_result = test_rms_norm(
+ ffconfig,
+ input_data,
+ eps=eps_value,
+ dim=dim_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying rms_norm:")
+ print(output_result)
diff --git a/examples/python/native/ops/rsqrt.py b/examples/python/native/ops/rsqrt.py
new file mode 100644
index 0000000000..3d9ab65449
--- /dev/null
+++ b/examples/python/native/ops/rsqrt.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_rsqrt(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ rsqrt_output = ffmodel.rsqrt(input_tensor, name="rsqrt_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ rsqrt_output.inline_map(ffmodel, ffconfig)
+ rsqrt_result = rsqrt_output.get_array(ffmodel, ffconfig)
+
+ return rsqrt_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ rsqrt_result = test_rsqrt(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying rsqrt function:")
+ print(rsqrt_result)
diff --git a/examples/python/native/ops/sampling.py b/examples/python/native/ops/sampling.py
new file mode 100644
index 0000000000..2219f09eff
--- /dev/null
+++ b/examples/python/native/ops/sampling.py
@@ -0,0 +1,55 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sampling(ffconfig, input_arr: np.ndarray, top_p: float, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ sampling_output = ffmodel.sampling(
+ input_tensor,
+ top_p,
+ name="sampling_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_MEAN_SQUARED_ERROR,
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ sampling_output.inline_map(ffmodel, ffconfig)
+ output_result = sampling_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ top_p_value = 0.8
+
+ output_result = test_sampling(
+ ffconfig,
+ input_data,
+ top_p=top_p_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying sampling:")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_add.py b/examples/python/native/ops/scalar_add.py
new file mode 100644
index 0000000000..48a316ea8a
--- /dev/null
+++ b/examples/python/native/ops/scalar_add.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_add(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_add_output = ffmodel.scalar_add(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_add_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_add_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_add_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_add(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar addition with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_multiply.py b/examples/python/native/ops/scalar_multiply.py
new file mode 100644
index 0000000000..ebae5cce01
--- /dev/null
+++ b/examples/python/native/ops/scalar_multiply.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_multiply(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_multiply_output = ffmodel.scalar_multiply(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_multiply_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_multiply_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_multiply_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_multiply(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar multiplication with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_sub.py b/examples/python/native/ops/scalar_sub.py
new file mode 100644
index 0000000000..2dc467b573
--- /dev/null
+++ b/examples/python/native/ops/scalar_sub.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_sub(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_sub_output = ffmodel.scalar_sub(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_sub_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_sub_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_sub_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_sub(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar subtraction with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_true_divide.py b/examples/python/native/ops/scalar_true_divide.py
new file mode 100644
index 0000000000..f1b64df506
--- /dev/null
+++ b/examples/python/native/ops/scalar_true_divide.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_true_divide(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_true_divide_output = ffmodel.scalar_true_divide(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_true_divide_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_true_divide_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_true_divide_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_true_divide(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar true division with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/sigmoid.py b/examples/python/native/ops/sigmoid.py
new file mode 100644
index 0000000000..0fbe21df45
--- /dev/null
+++ b/examples/python/native/ops/sigmoid.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'Sigmoid' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_sigmoid(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Sigmoid activation
+ out = ffmodel.sigmoid(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_sigmoid(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Sigmoid activation:")
+ print(result)
diff --git a/examples/python/native/ops/sigmoid_silu_multi.py b/examples/python/native/ops/sigmoid_silu_multi.py
new file mode 100644
index 0000000000..cecc3e102e
--- /dev/null
+++ b/examples/python/native/ops/sigmoid_silu_multi.py
@@ -0,0 +1,58 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sigmoid_silu_multi(ffconfig, input1_arr: np.ndarray, input2_arr: np.ndarray, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT)
+ input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT)
+
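+    # sigmoid_silu_multi fuses a SiLU gate on input1 with an element-wise multiply by input2
+    # (roughly silu(input1) * input2, the gating pattern used in LLaMA-style MLP blocks).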
+ sigmoid_silu_multi_output = ffmodel.sigmoid_silu_multi(
+ input1_tensor,
+ input2_tensor,
+ name="sigmoid_silu_multi_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr)
+ dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ sigmoid_silu_multi_output.inline_map(ffmodel, ffconfig)
+ output_result = sigmoid_silu_multi_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ output_result = test_sigmoid_silu_multi(ffconfig, input1_data, input2_data)
+
+ print("Input1 Array:")
+ print(input1_data)
+ print("\nInput2 Array:")
+ print(input2_data)
+ print("\nOutput Array after applying sigmoid_silu_multi:")
+ print(output_result)
diff --git a/examples/python/native/ops/sin.py b/examples/python/native/ops/sin.py
new file mode 100644
index 0000000000..4b60a4e1d4
--- /dev/null
+++ b/examples/python/native/ops/sin.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sin(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ sin_output = ffmodel.sin(input_tensor, name="sin_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ sin_output.inline_map(ffmodel, ffconfig)
+ sin_result = sin_output.get_array(ffmodel, ffconfig)
+
+ return sin_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ sin_result = test_sin(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying sin function:")
+ print(sin_result)
diff --git a/examples/python/native/ops/softmax.py b/examples/python/native/ops/softmax.py
new file mode 100644
index 0000000000..b5481bcc80
--- /dev/null
+++ b/examples/python/native/ops/softmax.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'Softmax' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_softmax(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Softmax activation
+ out = ffmodel.softmax(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10).astype(np.float32)
+
+ result = test_softmax(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Softmax activation:")
+ print(result)
diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py
new file mode 100644
index 0000000000..bd1aaa189b
--- /dev/null
+++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_spec_inc_multihead_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
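+    # spec_inc_multihead_self_attention is the incremental multi-head self-attention variant
+    # used by the small speculative models during beam search (BEAM_SEARCH_MODE).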
+ spec_inc_multihead_self_attention_output = ffmodel.spec_inc_multihead_self_attention(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="spec_inc_multihead_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ spec_inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = spec_inc_multihead_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_spec_inc_multihead_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying spec_inc_multihead_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py
new file mode 100644
index 0000000000..0b731c99e0
--- /dev/null
+++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_spec_inc_multiquery_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
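+    # The multiquery variant decouples the number of key/value heads (num_kv_heads) from the
+    # number of query heads (num_q_heads), i.e. grouped-query attention.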
+ spec_inc_multiquery_self_attention_output = ffmodel.spec_inc_multiquery_self_attention(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="spec_inc_multiquery_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ spec_inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = spec_inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_spec_inc_multiquery_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying spec_inc_multiquery_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/split.py b/examples/python/native/ops/split.py
new file mode 100644
index 0000000000..d03a52a769
--- /dev/null
+++ b/examples/python/native/ops/split.py
@@ -0,0 +1,47 @@
+# The basis for this test of the 'split' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_split(ffconfig, input_arr: np.ndarray) -> List[np.ndarray]:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
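+    # Split into 2 equal chunks along axis 1: the (batch, 10, 10, 10) input yields two
+    # (batch, 5, 10, 10) outputs.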
+ out1, out2 = ffmodel.split(input_tensor, 2, axis=1)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out1.inline_map(ffmodel, ffconfig)
+ out2.inline_map(ffmodel, ffconfig)
+
+ return [out1.get_array(ffmodel, ffconfig), out2.get_array(ffmodel, ffconfig)]
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+    input_data = np.random.randn(ffconfig.batch_size, 10, 10, 10).astype(np.float32)
+    output_list = test_split(ffconfig, input_data)
+
+ print("Output Tensor 1:")
+ print(output_list[0])
+
+ print("\nOutput Tensor 2:")
+ print(output_list[1])
diff --git a/examples/python/native/ops/subtract.py b/examples/python/native/ops/subtract.py
new file mode 100644
index 0000000000..5f829cbae1
--- /dev/null
+++ b/examples/python/native/ops/subtract.py
@@ -0,0 +1,45 @@
+# The basis for this test of the 'subtract' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_subtract(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.subtract(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_subtract(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/tanh.py b/examples/python/native/ops/tanh.py
new file mode 100644
index 0000000000..ba4ba7d6ff
--- /dev/null
+++ b/examples/python/native/ops/tanh.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'tanh' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_tanh(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply tanh activation
+ out = ffmodel.tanh(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_tanh(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after tanh activation:")
+ print(result)
diff --git a/examples/python/native/ops/transpose.py b/examples/python/native/ops/transpose.py
new file mode 100644
index 0000000000..6f514d660c
--- /dev/null
+++ b/examples/python/native/ops/transpose.py
@@ -0,0 +1,38 @@
+# The basis for this test of the 'transpose' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_transpose(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.transpose(input_tensor, [ffconfig.batch_size, 10, 5, 10])
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+    input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+    _ = test_transpose(ffconfig, input_data)
diff --git a/examples/python/native/print_layers.py b/examples/python/native/print_layers.py
index 22b87e0b86..481ecc3477 100644
--- a/examples/python/native/print_layers.py
+++ b/examples/python/native/print_layers.py
@@ -119,6 +119,9 @@ def top_level_task():
# ffmodel.print_layers(0)
+
if __name__ == "__main__":
print("alexnet")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/split.py b/examples/python/native/split.py
index dfd8b0e572..f79ff04e14 100644
--- a/examples/python/native/split.py
+++ b/examples/python/native/split.py
@@ -77,6 +77,9 @@ def top_level_task():
# if accuracy < ModelAccuracy.CIFAR10_CNN.value:
# assert 0, 'Check Accuracy'
+
if __name__ == "__main__":
print("cifar10 cnn split")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/img/overview.png b/img/overview.png
new file mode 100644
index 0000000000..5264e2d41a
Binary files /dev/null and b/img/overview.png differ
diff --git a/img/performance.png b/img/performance.png
new file mode 100644
index 0000000000..668e579197
Binary files /dev/null and b/img/performance.png differ
diff --git a/img/spec_infer_demo.gif b/img/spec_infer_demo.gif
new file mode 100644
index 0000000000..c0fda87b71
Binary files /dev/null and b/img/spec_infer_demo.gif differ
diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h
index 6f95354823..65ab33b513 100644
--- a/include/flexflow/accessor.h
+++ b/include/flexflow/accessor.h
@@ -61,6 +61,7 @@ class GenericTensorAccessorW {
float *get_float_ptr() const;
double *get_double_ptr() const;
half *get_half_ptr() const;
+ char *get_byte_ptr() const;
DataType data_type;
Legion::Domain domain;
void *ptr;
@@ -79,6 +80,7 @@ class GenericTensorAccessorR {
float const *get_float_ptr() const;
double const *get_double_ptr() const;
half const *get_half_ptr() const;
+ char const *get_byte_ptr() const;
DataType data_type;
Legion::Domain domain;
void const *ptr;
diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
new file mode 100644
index 0000000000..873fed0bdb
--- /dev/null
+++ b/include/flexflow/batch_config.h
@@ -0,0 +1,238 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "legion.h"
+#include
+#include
+
+// #define MAX_SEQ_LEN 1024
+// #define BATCH_SIZE 2
+// #define BATCH_SIZE 16
+// #define MAX_REQUESTS 256
+
+namespace FlexFlow {
+
+class InferenceResult;
+class BeamInferenceResult;
+
+using BatchConfigFuture = Legion::Future;
+using InferenceResultFuture = Legion::Future;
+using BeamSearchBatchConfigFuture = Legion::Future;
+using TreeVerifyBatchConfigFuture = Legion::Future;
+using BeamInferenceResultFuture = Legion::Future;
+
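+// Flags selecting which optimizer sub-steps (computing gradients, zeroing them, applying the
+// weight update, saving updated weights) run for a finetuning request in a given step; see
+// set_optimizer_tasks below.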
+struct OptimizerTasks {
+ bool compute_gradients = true;
+ bool reset_gradients_to_zero = false;
+ bool update_weights = false;
+ bool save_updated_weights = false;
+};
+
+void set_optimizer_tasks(OptimizerTasks &tasks,
+ int max_training_steps,
+ int completed_training_steps,
+ int gradient_accumulation_steps);
+
+class BatchConfig {
+public:
+ using RequestGuid = size_t;
+ using TokenId = int;
+ BatchConfig();
+ int num_active_requests() const;
+ int num_active_tokens() const;
+ int num_active_infr_tokens() const;
+ int num_active_peft_tokens() const;
+ static int max_requests_per_batch();
+ static int max_tokens_per_batch();
+ static int max_verify_tokens_per_batch();
+ static int max_spec_tree_token_num();
+ static int max_sequence_length();
+ friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ virtual InferenceMode get_mode() const;
+ static BatchConfig const *from_future(BatchConfigFuture const &future);
+ // Maximum possible values for different parameters
+ // These maximum values are used for copying BatchConfig
+ // across workers
+ static int const MAX_NUM_REQUESTS = 65;
+ static int const MAX_NUM_TOKENS = 1024;
+ static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
+
+ // Set by update
+
+ int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0;
+  // number of tokens in the prompt phase / start offset of tokens in the incremental
+  // decoding phase: num_tokens - num_prompt_tokens = num_generation_tokens
+ int num_generation_tokens = 0;
+
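+  // Metadata for one request slot in the batch (token offsets, sequence-length limits, and
+  // PEFT settings); requestsInfo below holds one entry per slot.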
+ struct PerRequestInfo {
+ PerRequestInfo() {
+ first_token_depth_in_request = 0;
+ first_token_offset_in_batch = 0;
+ num_tokens_in_batch = 0;
+ max_sequence_length = 0;
+ request_guid = 0;
+ prompt_phase = false;
+ batch_config_request_id = -1;
+ peft_model_id = PEFTModelID::NO_ID;
+ peft_bwd = false;
+ optimizer_tasks = {true, false, false, false};
+ }
+ int first_token_depth_in_request;
+ int first_token_offset_in_batch;
+ int num_tokens_in_batch;
+ int max_sequence_length;
+
+ // request id in batch config:
+ int batch_config_request_id = -1;
+ bool prompt_phase = false;
+ RequestGuid request_guid;
+ // PEFT fields
+ PEFTModelID peft_model_id;
+ bool peft_bwd;
+ OptimizerTasks optimizer_tasks;
+ };
+ struct PerTokenInfo {
+ int abs_depth_in_request;
+ int request_index;
+ TokenId token_id;
+ };
+
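+  // Attention mask over the speculative token tree (one 64-bit row per tree slot), plus
+  // bookkeeping for how much of the KV cache lies outside the tree.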
+ struct BitMask {
+ unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0};
+
+    // number of tokens before the tree; every sub-request needs this part of the
+    // cache
+ int non_tree_cache_size = 0;
+
+ // current tree size
+ int tree_size = 0;
+
+ int this_layer_size = 0;
+
+ // input length-> prompt/root
+ int prompt_size = 0;
+ };
+
+ BitMask causalMask[MAX_NUM_REQUESTS];
+ PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
+ PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
+ PerTokenInfo labelsInfo[MAX_NUM_TOKENS];
+
+ bool request_completed[MAX_NUM_REQUESTS];
+ bool request_running[MAX_NUM_REQUESTS];
+};
+
+class TreeVerifyBatchConfig : public BatchConfig {
+public:
+ TreeVerifyBatchConfig();
+ ~TreeVerifyBatchConfig();
+ InferenceMode get_mode() const;
+ friend std::ostream &operator<<(std::ostream &os,
+ TreeVerifyBatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ struct CommittedTokensInfo {
+ int token_index; // the index of the token in the previous batch
+ int request_index; // request index in the batch
+ int token_depth; // position of the token in the request's sequence
+ };
+
+ int num_tokens_to_commit;
+ CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
+};
+
+struct InferenceResult {
+ static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
+ BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
+ float finetuning_loss;
+};
+
+class BeamSearchBatchConfig : public BatchConfig {
+public:
+ BeamSearchBatchConfig();
+ BeamSearchBatchConfig(int model_id);
+ BeamSearchBatchConfig(size_t beam_width, size_t target_iterations);
+ BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id);
+ InferenceMode get_mode() const;
+
+ ~BeamSearchBatchConfig();
+
+ friend std::ostream &operator<<(std::ostream &os,
+ BeamSearchBatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ bool done() const;
+ int max_beam_depth_all_requests() const;
+ int current_depth_all_requests() const;
+ int get_speculative_request_num() const;
+
+ size_t beam_width;
+ size_t target_iterations;
+
+  // how many requests are in the speculative phase
+ int speculative_request_num = 0;
+ inline static int const MAX_BEAM_WIDTH = 3;
+ inline static int const MAX_BEAM_DEPTH = 8;
+
+ // maximum tree branches for a request
+ inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+
+ int model_id;
+
+ struct BeamSearchPerRequestInfo {
+ int beam_size;
+ int current_depth = -1;
+ int max_depth = MAX_BEAM_DEPTH;
+
+ BatchConfig::TokenId
+ tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int sub_request_num;
+ };
+
+ struct BeamSearchPerTokenInfo {
+ int sub_request_index;
+ };
+
+ BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS];
+ BeamSearchPerTokenInfo
+ beamTokenInfo[MAX_NUM_TOKENS +
+ MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS];
+
+ int sub_requests[MAX_NUM_REQUESTS];
+
+private:
+ size_t current_iteration;
+};
+
+struct BeamInferenceResult {
+ static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
+ BatchConfig::TokenId
+ token_ids[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ float probs[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int parent_id[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+};
+
+}; // namespace FlexFlow
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index d82b1377c7..dd9d657117 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -16,20 +16,25 @@
#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
+#include "flexflow/batch_config.h"
#include "legion.h"
#include
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include
#include
#elif defined(FF_USE_HIP_ROCM)
-#include
+#include
#include
#else
#error "Unknown device"
#endif
#include "tl/optional.hpp"
#ifdef FF_USE_NCCL
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include
+#else
+#include
+#endif
#endif
namespace FlexFlow {
@@ -37,14 +42,15 @@ namespace FlexFlow {
// ========================================================
// Define Runtime Constants
// ========================================================
-#define MAX_NUM_INPUTS 256
-#define MAX_NUM_WEIGHTS 64
-#define MAX_NUM_OUTPUTS 256
-#define MAX_NUM_FUSED_OPERATORS 64
-#define MAX_NUM_FUSED_TENSORS 64
+#define MAX_NUM_INPUTS 2048
+#define MAX_NUM_WEIGHTS 2048
+#define MAX_NUM_OUTPUTS 2048
+#define MAX_NUM_FUSED_OPERATORS 2048
+#define MAX_NUM_FUSED_TENSORS 2048
#define MAX_NUM_WORKERS 1024
#define MAX_FILENAME 200
#define MAX_OPNAME 128
+#define MAX_NUM_TRANSFORMER_LAYERS 100
// DataLoader
#define MAX_SAMPLES_PER_LOAD 64
#define MAX_FILE_LENGTH 128
@@ -59,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
#endif
class FFConfig;
+class MemoryAllocator;
+class PEFTWeightAllocator;
+
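+// Packs the per-token and per-request metadata of BatchConfig, BeamSearchBatchConfig, and
+// TreeVerifyBatchConfig into a single struct so it can be staged in
+// FFHandler::batch_config_metadata below.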
+struct CombinedBatchConfigMetaStruct {
+ BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS];
+ BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS];
+ BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS];
+ bool request_completed[BatchConfig::MAX_NUM_REQUESTS];
+
+ BeamSearchBatchConfig::BeamSearchPerTokenInfo
+ beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS +
+ BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM *
+ BeamSearchBatchConfig::MAX_NUM_REQUESTS];
+ BeamSearchBatchConfig::BeamSearchPerRequestInfo
+ beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS];
+
+ TreeVerifyBatchConfig::CommittedTokensInfo
+ committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS];
+};
struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
@@ -70,6 +95,19 @@ struct FFHandler {
#endif
void *workSpace;
size_t workSpaceSize;
+ CombinedBatchConfigMetaStruct *batch_config_metadata;
+
+  // request info + token info + topology mask info
+ size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
+ void *offload_reserve_space;
+ size_t offload_reserve_space_size;
+ // PEFT related fields
+ MemoryAllocator *peft_activation_allocator;
+ size_t peft_activation_reserve_space_size;
+ PEFTWeightAllocator *peft_weight_allocator;
+ size_t peft_weight_reserve_space_size;
+ // Quantization fields
+ DataType quantization_type;
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
ncclComm_t ncclComm;
@@ -78,6 +116,10 @@ struct FFHandler {
struct FFInitInfo {
size_t workSpaceSize;
+ size_t offload_reserve_space_size;
+ size_t peft_activation_reserve_space_size;
+ size_t peft_weight_reserve_space_size;
+ DataType quantization_type;
bool allowTensorOpMathConversion;
// int myRank, allRanks;
};
@@ -122,19 +164,32 @@ class FFConfig {
size_t workSpaceSize;
Legion::Context lg_ctx;
Legion::Runtime *lg_hlr;
- Legion::FieldSpace field_space;
- bool syntheticInput, profiling, perform_fusion;
+ Legion::IndexSpaceT<1> all_gpu_task_is;
+ // Legion::FieldSpace field_space;
+ bool benchmarking, profiling, perform_fusion;
+ bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
bool search_overlap_backward_update;
CompMode computationMode;
+ bool cpu_offload;
+ size_t offload_reserve_space_size;
+ DataType quantization_type;
+ // PEFT related fields
+ bool enable_peft;
+ size_t peft_activation_reserve_space_size;
+ size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
bool only_data_parallel;
bool enable_sample_parallel;
bool enable_parameter_parallel;
bool enable_attribute_parallel;
bool enable_inplace_optimizations;
+ // Control parallelism degrees in inference
+ int data_parallelism_degree;
+ int tensor_parallelism_degree;
+ int pipeline_parallelism_degree;
// Control Tensor Op Math Conversion
bool allow_tensor_op_math_conversion;
std::string dataset_path;
diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 5658e2923d..24b722c36f 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -33,6 +33,8 @@ enum DataType {
DT_HALF = 43,
DT_FLOAT = 44,
DT_DOUBLE = 45,
+ DT_INT4 = 46,
+ DT_INT8 = 47,
DT_NONE = 49,
};
@@ -44,6 +46,12 @@ enum LossType {
LOSS_IDENTITY = 54,
};
+enum OptimizerType {
+ OPTIMIZER_TYPE_NONE = 60,
+ OPTIMIZER_TYPE_SGD = 61,
+ OPTIMIZER_TYPE_ADAM = 62,
+};
+
enum CompMode {
COMP_MODE_TRAINING = 70,
COMP_MODE_INFERENCE = 71,
@@ -64,6 +72,17 @@ enum MetricsType {
METRICS_MEAN_ABSOLUTE_ERROR = 1032,
};
+enum InferenceMode {
+ INC_DECODING_MODE = 2001,
+ BEAM_SEARCH_MODE = 2002,
+ TREE_VERIFY_MODE = 2003,
+};
+
+enum RequestType {
+ REQ_INFERENCE = 4001,
+ REQ_FINETUNING = 4002,
+};
+
// This is consistent with TASO's OpType
// https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138
enum OperatorType {
@@ -129,6 +148,7 @@ enum OperatorType {
OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape
OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size
OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK
+ OP_ARG_TOPK,
OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where
OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil
OP_CAST, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Cast
@@ -150,47 +170,74 @@ enum OperatorType {
OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html
OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html
OP_LAYERNORM,
+ OP_RESIDUAL_LAYERNORM,
+ OP_ADD_BIAS_RESIDUAL_LAYERNORM,
+ OP_SIGMOID_SILU_MULTI,
+ OP_EXPERTS,
OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html
+ OP_RMS_NORM,
+ OP_RESIDUAL_RMS_NORM,
+ OP_BEAM_TOPK,
+ OP_ARGMAX,
+ OP_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_SAMPLING,
+ // PEFT Ops
+ OP_LORA,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
OP_REPLICATE,
OP_REDUCTION,
OP_PIPELINE,
+ OP_ALLREDUCE,
+ OP_PARALLEL_IDENTITY,
OP_FUSED_PARALLEL,
OP_INVALID,
};
+enum ModelType {
+ UNKNOWN = 3001,
+ LLAMA = 3002,
+ OPT = 3003,
+ FALCON = 3004,
+ STARCODER = 3005,
+ MPT = 3006
+};
+
enum PMParameter {
- PM_OP_TYPE, // AnyOp
- PM_NUM_INPUTS, // AnyOp
- PM_NUM_OUTPUTS, // AnyOp
- PM_GROUP, // Conv2D
- PM_KERNEL_H, // Conv2D, Pool2D
- PM_KERNEL_W, // Conv2D, Pool2D
- PM_STRIDE_H, // Conv2D, Pool2D
- PM_STRIDE_W, // Conv2D, Pool2D
- PM_PADDING_H, // Conv2D, Pool2D
- PM_PADDING_W, // Conv2D, Pool2D
- PM_ACTI, // Conv2D, Pool2D
- PM_NUMDIM, // Concat, Transpose
- PM_AXIS, // Concat, Split
- PM_PERM, // Transpose
- PM_OUTSHUFFLE, // Transpose
- PM_MERGE_GCONV_COUNT, // MergeGConv
- PM_AXES, // Squeeze, Unsqueeze, Reduce*
- PM_KEEP_DIMS, // Reduce*
- PM_EPSILON, // BatchNorm
- PM_REPARTITION_DIM, // Repartition
- PM_REPARTITION_DEGREE, // Repartition
- PM_REPLICATE_DIM, // Replicate
- PM_REPLICATE_DEGREE, // Replicate
- PM_COMBINE_DIM, // Combine
- PM_COMBINE_DEGREE, // Combine
- PM_REDUCTION_DIM, // Reduction
- PM_REDUCTION_DEGREE, // Reduction
- PM_SOFTMAX_DIM, // Softmax
- PM_NUM_HEADS, // MultiHeadAttention
+ PM_OP_TYPE, // AnyOp
+ PM_NUM_INPUTS, // AnyOp
+ PM_NUM_OUTPUTS, // AnyOp
+ PM_GROUP, // Conv2D
+ PM_KERNEL_H, // Conv2D, Pool2D
+ PM_KERNEL_W, // Conv2D, Pool2D
+ PM_STRIDE_H, // Conv2D, Pool2D
+ PM_STRIDE_W, // Conv2D, Pool2D
+ PM_PADDING_H, // Conv2D, Pool2D
+ PM_PADDING_W, // Conv2D, Pool2D
+ PM_ACTI, // Conv2D, Pool2D
+ PM_NUMDIM, // Concat, Transpose
+ PM_AXIS, // Concat, Split
+ PM_PERM, // Transpose
+ PM_OUTSHUFFLE, // Transpose
+ PM_MERGE_GCONV_COUNT, // MergeGConv
+ PM_AXES, // Squeeze, Unsqueeze, Reduce*
+ PM_KEEP_DIMS, // Reduce*
+ PM_EPSILON, // BatchNorm
+ PM_REPARTITION_DIM, // Repartition
+ PM_REPARTITION_DEGREE, // Repartition
+ PM_REPLICATE_DIM, // Replicate
+ PM_REPLICATE_DEGREE, // Replicate
+ PM_COMBINE_DIM, // Combine
+ PM_COMBINE_DEGREE, // Combine
+ PM_REDUCTION_DIM, // Reduction
+ PM_REDUCTION_DEGREE, // Reduction
+ PM_ALLREDUCE_DIM, // AllReduce
+  PM_PARALLEL_IDENTITY_DIM, // ParallelIdentity
+ PM_SOFTMAX_DIM, // Softmax
+ PM_NUM_HEADS, // MultiHeadAttention
PM_INVALID,
PM_PARALLEL_DIM,
PM_PARALLEL_DEGREE,
@@ -236,5 +283,7 @@ enum {
TENSOR_GUID_LAST_VALID = 3999999,
PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000,
NODE_GUID_FIRST_VALID = 5000000,
+ PEFT_MODEL_ID_FIRST_VALID = 6000000,
+ PEFT_MODEL_ID_LAST_VALID = 6999999
};
#endif // _FLEXFLOW_CONST_H_
diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h
index fcd881e57e..421a139d57 100644
--- a/include/flexflow/ffconst_utils.h
+++ b/include/flexflow/ffconst_utils.h
@@ -8,8 +8,16 @@ namespace FlexFlow {
std::string get_operator_type_name(OperatorType type);
+size_t data_type_size(DataType type);
+
+#define INT4_NUM_OF_ELEMENTS_PER_GROUP 32
+
+size_t get_quantization_to_byte_size(DataType type,
+ DataType quantization_type,
+ size_t num_elements);
+
std::ostream &operator<<(std::ostream &, OperatorType);
}; // namespace FlexFlow
-#endif // _FLEXFLOW_FFCONST_UTILS_H
\ No newline at end of file
+#endif // _FLEXFLOW_FFCONST_UTILS_H
diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h
index a71c85dbc8..3e482b8d67 100644
--- a/include/flexflow/fftype.h
+++ b/include/flexflow/fftype.h
@@ -3,20 +3,46 @@
#include "flexflow/ffconst.h"
#include
+#include
+#include
namespace FlexFlow {
class LayerID {
public:
+ static const LayerID NO_ID;
LayerID();
- LayerID(size_t id);
+ LayerID(size_t id, size_t transformer_layer_id, size_t model_id);
bool is_valid_id() const;
friend bool operator==(LayerID const &lhs, LayerID const &rhs);
+public:
+ size_t id, transformer_layer_id, model_id;
+};
+
+class PEFTModelID {
+public:
+ static const PEFTModelID NO_ID;
+ PEFTModelID();
+ PEFTModelID(size_t id);
+ bool is_valid_id() const;
+ friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
+ friend std::ostream &operator<<(std::ostream &os,
+ PEFTModelID const &peft_model_id);
+
public:
size_t id;
};
}; // namespace FlexFlow
-#endif // _FF_TYPE_H
\ No newline at end of file
+namespace std {
+template <>
+struct hash<FlexFlow::PEFTModelID> {
+ size_t operator()(FlexFlow::PEFTModelID const &n) const {
+ return n.id;
+ }
+};
+} // namespace std
+
+#endif // _FF_TYPE_H
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 16ce3ac205..52b4b3d362 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -47,6 +47,19 @@ FF_NEW_OPAQUE_TYPE(flexflow_dlrm_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_dataloader_4d_t);
FF_NEW_OPAQUE_TYPE(flexflow_dataloader_2d_t);
FF_NEW_OPAQUE_TYPE(flexflow_single_dataloader_t);
+// Inference
+FF_NEW_OPAQUE_TYPE(flexflow_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
+FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
+FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
+FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t);
// -----------------------------------------------------------------------
// FFConfig
@@ -72,12 +85,31 @@ int flexflow_config_get_epochs(flexflow_config_t handle);
bool flexflow_config_get_enable_control_replication(flexflow_config_t handle);
+int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_);
+
+int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);
+
+int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);
+
+void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
+void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
+void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
int flexflow_config_get_python_data_loader_type(flexflow_config_t handle);
+
+bool flexflow_config_get_offload(flexflow_config_t handle);
+
// -----------------------------------------------------------------------
// FFModel
// -----------------------------------------------------------------------
-flexflow_model_t flexflow_model_create(flexflow_config_t config);
+flexflow_model_t flexflow_model_create(flexflow_config_t config,
+ bool cpu_offload);
void flexflow_model_destroy(flexflow_model_t handle);
@@ -197,9 +229,10 @@ flexflow_tensor_t
flexflow_tensor_t
flexflow_model_add_embedding(flexflow_model_t handle,
const flexflow_tensor_t input,
- int num_entires,
+ int num_entries,
int out_dim,
enum AggrMode aggr,
+ enum DataType dtype,
flexflow_op_t shared_op,
flexflow_initializer_t kernel_initializer,
char const *name);
@@ -228,8 +261,41 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle,
int *axes,
bool elementwise_affine,
float eps,
+ bool use_bias,
char const *name);
+flexflow_tensor_t *
+ flexflow_model_add_residual_layer_norm(flexflow_model_t handle,
+ const flexflow_tensor_t input,
+ const flexflow_tensor_t residual1,
+ const flexflow_tensor_t residual2,
+ bool use_two_residuals,
+ int n,
+ int *axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
+ flexflow_model_t handle,
+ const flexflow_tensor_t input,
+ const flexflow_tensor_t residual,
+ int n,
+ int *axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t
+ flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle,
+ const flexflow_tensor_t input1,
+ const flexflow_tensor_t input2,
+ char const *name);
+
flexflow_tensor_t
flexflow_model_add_batch_matmul(flexflow_model_t handle,
const flexflow_tensor_t a,
@@ -371,6 +437,170 @@ flexflow_tensor_t flexflow_model_add_multihead_attention(
flexflow_initializer_t kernel_initializer,
char const *name);
+flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ float eps,
+ int dim,
+ char const *name);
+
+flexflow_tensor_t *
+ flexflow_model_add_residual_rms_norm(flexflow_model_t handle_,
+ const flexflow_tensor_t input1_,
+ const flexflow_tensor_t input2_,
+ float eps,
+ int dim,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int k,
+ bool sorted,
+ bool speculative_decoding,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int max_beam_size,
+ bool sorted,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ float top_p,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ bool beam_search,
+ char const *name);
+
+flexflow_peft_model_id_t flexflow_model_add_lora_layer(
+ flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_);
+
void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,
flexflow_sgd_optimizer_t optimizer);
@@ -390,6 +620,23 @@ flexflow_tensor_t flexflow_model_get_parameter_by_id(flexflow_model_t handle,
flexflow_perf_metrics_t
flexflow_model_get_perf_metrics(flexflow_model_t handle);
+void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);
+
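+// The pointer arguments below are parallel arrays with one entry per request: inference
+// requests use the prompt/output and max-sequence-length fields, while finetuning requests
+// use the dataset filepath and training-step fields.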
+void flexflow_model_generate(flexflow_model_t handle_,
+ int num_requests,
+ enum RequestType *request_types,
+ char const **input_texts,
+ char **output_texts,
+ int *max_seq_lengths,
+ flexflow_peft_model_id_t *peft_model_ids,
+ char const **dataset_filepaths,
+ int *training_steps,
+ int **output_length_and_tokens,
+ int *num_finetuning_losses,
+ float *finetuning_losses);
+
+void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);
+
// -----------------------------------------------------------------------
// Tensor
// -----------------------------------------------------------------------
@@ -699,6 +946,222 @@ void flexflow_op_forward(flexflow_op_t handle, flexflow_model_t model);
void flexflow_perform_registration(void);
+// -----------------------------------------------------------------------
+// BatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_batch_config_t flexflow_batch_config_create(void);
+
+void flexflow_batch_config_destroy(flexflow_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// TreeVerifyBatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_tree_verify_batch_config_t
+ flexflow_tree_verify_batch_config_create(void);
+
+void flexflow_tree_verify_batch_config_destroy(
+ flexflow_tree_verify_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// BeamSearchBatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_beam_search_batch_config_t
+ flexflow_beam_search_batch_config_create(void);
+
+void flexflow_beam_search_batch_config_destroy(
+ flexflow_beam_search_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// RequestManager
+// -----------------------------------------------------------------------
+
+flexflow_request_manager_t flexflow_request_manager_get_request_manager(void);
+
+// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_);
+
+void flexflow_request_manager_set_max_requests_per_batch(
+ flexflow_request_manager_t handle_, int max_num_requests);
+
+void flexflow_request_manager_set_max_tokens_per_batch(
+ flexflow_request_manager_t handle_, int max_num_tokens);
+
+void flexflow_request_manager_set_max_spec_tree_token_num(
+ flexflow_request_manager_t handle_, int max_num_tokens);
+
+void flexflow_request_manager_set_max_sequence_length(
+ flexflow_request_manager_t handle_, int max_seq_length);
+
+void flexflow_request_manager_set_enable_peft_finetuning(
+ flexflow_request_manager_t handle_, bool enable_peft_finetuning_);
+
+void flexflow_request_manager_register_tokenizer(
+ flexflow_request_manager_t handle_,
+ enum ModelType model_type,
+ int bos_token_id,
+ int eos_token_id,
+ char const *tokenizer_filepath);
+
+void flexflow_request_manager_register_output_filepath(
+ flexflow_request_manager_t handle_, char const *output_filepath);
+
+int flexflow_request_manager_register_ssm_model(
+ flexflow_request_manager_t handle_, flexflow_model_t model_handle_);
+
+void flexflow_request_manager_start_background_server(
+ flexflow_request_manager_t handle_, flexflow_model_t model_handle_);
+
+void flexflow_request_manager_terminate_background_server(
+ flexflow_request_manager_t handle_);
+
+// -----------------------------------------------------------------------
+// InferenceManager
+// -----------------------------------------------------------------------
+
+flexflow_inference_manager_t
+ flexflow_inference_manager_get_inference_manager(void);
+
+// void flexflow_inference_manager_destroy(flexflow_inference_manager_t
+// handle_);
+
+void flexflow_inference_manager_compile_model_and_allocate_buffer(
+ flexflow_inference_manager_t handle_, flexflow_model_t model_handle);
+
+void flexflow_inference_manager_init_operators_inference(
+ flexflow_inference_manager_t handle_, flexflow_model_t model_handle);
+
+void flexflow_inference_manager_register_model_weights_loader(
+ flexflow_inference_manager_t handle_,
+ flexflow_model_t model_handle,
+ flexflow_file_data_loader_t loader_handle);
+
+// -----------------------------------------------------------------------
+// FileDataLoader
+// -----------------------------------------------------------------------
+
+flexflow_file_data_loader_t
+ flexflow_file_data_loader_create(char const *weight_file_path,
+ int num_q_heads,
+ int num_kv_heads,
+ int hidden_dim,
+ int qkv_inner_dim,
+ int tensor_parallelism_degree,
+ bool use_full_precision);
+
+void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);
+
+void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
+ flexflow_model_t model_handle_);
+
+// // -----------------------------------------------------------------------
+// // LoraSGDOptimizerConfig
+// // -----------------------------------------------------------------------
+
+// flexflow_lora_sgd_optimizer_config_t
+// flexflow_lora_sgd_optimizer_config_create(
+// double lr, double momentum, bool nesterov, bool weight_decay);
+
+// void flexflow_lora_sgd_optimizer_config_destroy(
+// flexflow_lora_sgd_optimizer_config_t handle_);
+
+// // -----------------------------------------------------------------------
+// // LoraAdamOptimizerConfig
+// // -----------------------------------------------------------------------
+
+// flexflow_lora_adam_optimizer_config_t
+// flexflow_lora_adam_optimizer_config_create(double alpha,
+// double beta1,
+// double beta2,
+// double weight_decay,
+// double epsilon);
+
+// void flexflow_lora_adam_optimizer_config_destroy(
+// flexflow_lora_adam_optimizer_config_t handle_);
+
+// -----------------------------------------------------------------------
+// LoraLinearConfig
+// -----------------------------------------------------------------------
+
+flexflow_lora_linear_config_t
+ flexflow_lora_linear_config_create(char const *cache_folder_,
+ char const *peft_model_id_,
+ bool trainable,
+ bool init_lora_weights,
+ char const *base_model_name_or_path,
+ char const *precision,
+ int rank,
+ float lora_alpha,
+ float lora_dropout,
+ int num_target_modules,
+ char const **target_modules_,
+ enum OptimizerType optimizer_type,
+ float sgd_learning_rate,
+ float sgd_momentum,
+ bool sgd_nesterov,
+ float sgd_weight_decay,
+ float adam_alpha,
+ float adam_beta1,
+ float adam_beta2,
+ float adam_weight_decay,
+ float adam_epsilon);
+
+void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_);
+
+char const *flexflow_lora_linear_config_get_cache_folder(
+ flexflow_lora_linear_config_t handle_);
+
+char const *flexflow_lora_linear_config_get_peft_model_id(
+ flexflow_lora_linear_config_t handle_);
+
+int flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_);
+
+float flexflow_lora_linear_config_get_lora_alpha(
+ flexflow_lora_linear_config_t handle_);
+
+float flexflow_lora_linear_config_get_lora_dropout(
+ flexflow_lora_linear_config_t handle_);
+
+bool flexflow_lora_linear_config_get_trainable(
+ flexflow_lora_linear_config_t handle_);
+
+bool flexflow_lora_linear_config_get_init_lora_weights(
+ flexflow_lora_linear_config_t handle_);
+
+char const **flexflow_lora_linear_config_get_target_modules(
+ flexflow_lora_linear_config_t handle_, int *num_target_modules);
+
+char const *flexflow_lora_linear_config_get_base_model_name_or_path(
+ flexflow_lora_linear_config_t handle_);
+
+char const *flexflow_lora_linear_config_get_precision(
+ flexflow_lora_linear_config_t handle_);
+
+void flexflow_lora_linear_config_set_lora_alpha(
+ flexflow_lora_linear_config_t handle_, float value);
+
+void flexflow_lora_linear_config_set_lora_dropout(
+ flexflow_lora_linear_config_t handle_, float value);
+
+void flexflow_lora_linear_config_set_trainable(
+ flexflow_lora_linear_config_t handle_, bool value);
+
+void flexflow_lora_linear_config_set_init_lora_weights(
+ flexflow_lora_linear_config_t handle_, bool value);
+
+// -----------------------------------------------------------------------
+// PEFTModelID
+// -----------------------------------------------------------------------
+
+flexflow_peft_model_id_t flexflow_peft_model_id_create();
+
+flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id);
+
+flexflow_peft_model_id_t flexflow_peft_model_id_no_id();
+
+void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/flexflow/gpt_tokenizer.h b/include/flexflow/gpt_tokenizer.h
new file mode 100644
index 0000000000..ec08435809
--- /dev/null
+++ b/include/flexflow/gpt_tokenizer.h
@@ -0,0 +1,221 @@
+// version 0.1
+// Licensed under the MIT License .
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2019-2020 zili wang .
+
+#include <codecvt>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <locale>
+#include <nlohmann/json.hpp>
+#include <queue>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using json = nlohmann::json;
+
+typedef std::pair<std::string, std::string> bigram_pair;
+typedef std::pair<std::wstring, std::wstring> wbigram_pair;
+
+struct hash_pair {
+  template <class T1, class T2>
+  size_t operator()(std::pair<T1, T2> const &p) const {
+    auto hash1 = std::hash<T1>{}(p.first);
+    auto hash2 = std::hash<T2>{}(p.second);
+    return hash1 ^ hash2;
+  }
+};
+
+enum tokenizer_mode { GPT2_TOKENIZER, OPT_TOKENIZER };
+
+class GPT_Tokenizer {
+
+public:
+ GPT_Tokenizer(tokenizer_mode mode_,
+ std::string const &vocab_file,
+ std::string const &merge_file,
+ std::string const &bos_token_str = "",
+ const std::string eos_token_str = "",
+ const std::string pad_token_str = "",
+ const std::string unk_token_str = "",
+ const std::string mask_token_str = "") {
+ mode = mode_;
+ load_vocab(vocab_file);
+ load_merge(merge_file);
+ bos_token = bos_token_str;
+ eos_token = eos_token_str;
+ pad_token = pad_token_str;
+ unk_token = unk_token_str;
+ mask_token = mask_token_str;
+ bytes_encoder = bytes_to_unicode();
+ unicode_to_bytes();
+ };
+ // ~GPT_Tokenizer();
+  std::vector<std::string> bpe(std::wstring token);
+  std::vector<std::string> tokenize(std::string str);
+  int32_t convert_token_to_id(std::string token);
+  void encode(std::string str,
+              size_t max_length,
+              std::vector<int32_t> *input_ids,
+              std::vector<int32_t> *mask_ids);
+  std::string decode(std::vector<int32_t> input_ids,
+                     std::vector<int32_t> mask_ids);
+ tokenizer_mode mode;
+ std::string bos_token;
+ std::string eos_token;
+ std::string pad_token;
+ std::string unk_token;
+ std::string mask_token;
+ std::string strip(std::string const &inpt);
+
+private:
+  std::unordered_map<std::string, int32_t> vocab;
+  std::unordered_map<int32_t, std::string> inverse_vocab;
+  std::unordered_map<wbigram_pair, uint32_t, hash_pair> bpe_ranks;
+ wchar_t *bytes_to_unicode();
+ void unicode_to_bytes();
+ wchar_t *bytes_encoder;
+  std::unordered_map<wchar_t, char> bytes_decoder;
+ uint32_t cache_max_size = 500000;
+ uint32_t cache_word_max_length = 30;
+ std::string unicode_letter_expr =
+ "\\u0041-\\u005A\\u0061-\\u007A\\u00AA-\\u00AA\\u00B5-\\u00B5"
+ "\\u00BA-\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02C1"
+ "\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC-\\u02EC\\u02EE-\\u02EE"
+ "\\u0370-\\u0374\\u0376-\\u0377\\u037A-\\u037D\\u037F-\\u037F"
+ "\\u0386-\\u0386\\u0388-\\u038A\\u038C-\\u038C\\u038E-\\u03A1"
+ "\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u052F\\u0531-\\u0556"
+ "\\u0559-\\u0559\\u0560-\\u0588\\u05D0-\\u05EA\\u05EF-\\u05F2"
+ "\\u0620-\\u064A\\u066E-\\u066F\\u0671-\\u06D3\\u06D5-\\u06D5"
+ "\\u06E5-\\u06E6\\u06EE-\\u06EF\\u06FA-\\u06FC\\u06FF-\\u06FF"
+ "\\u0710-\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1-\\u07B1"
+ "\\u07CA-\\u07EA\\u07F4-\\u07F5\\u07FA-\\u07FA\\u0800-\\u0815"
+ "\\u081A-\\u081A\\u0824-\\u0824\\u0828-\\u0828\\u0840-\\u0858"
+ "\\u0860-\\u086A\\u08A0-\\u08B4\\u08B6-\\u08C7\\u0904-\\u0939"
+ "\\u093D-\\u093D\\u0950-\\u0950\\u0958-\\u0961\\u0971-\\u0980"
+ "\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0"
+ "\\u09B2-\\u09B2\\u09B6-\\u09B9\\u09BD-\\u09BD\\u09CE-\\u09CE"
+ "\\u09DC-\\u09DD\\u09DF-\\u09E1\\u09F0-\\u09F1\\u09FC-\\u09FC"
+ "\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30"
+ "\\u0A32-\\u0A33\\u0A35-\\u0A36\\u0A38-\\u0A39\\u0A59-\\u0A5C"
+ "\\u0A5E-\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91"
+ "\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9"
+ "\\u0ABD-\\u0ABD\\u0AD0-\\u0AD0\\u0AE0-\\u0AE1\\u0AF9-\\u0AF9"
+ "\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30"
+ "\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3D-\\u0B3D\\u0B5C-\\u0B5D"
+ "\\u0B5F-\\u0B61\\u0B71-\\u0B71\\u0B83-\\u0B83\\u0B85-\\u0B8A"
+ "\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C-\\u0B9C"
+ "\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9"
+ "\\u0BD0-\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28"
+ "\\u0C2A-\\u0C39\\u0C3D-\\u0C3D\\u0C58-\\u0C5A\\u0C60-\\u0C61"
+ "\\u0C80-\\u0C80\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
+ "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD-\\u0CBD\\u0CDE-\\u0CDE"
+ "\\u0CE0-\\u0CE1\\u0CF1-\\u0CF2\\u0D04-\\u0D0C\\u0D0E-\\u0D10"
+ "\\u0D12-\\u0D3A\\u0D3D-\\u0D3D\\u0D4E-\\u0D4E\\u0D54-\\u0D56"
+ "\\u0D5F-\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1"
+ "\\u0DB3-\\u0DBB\\u0DBD-\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E30"
+ "\\u0E32-\\u0E33\\u0E40-\\u0E46\\u0E81-\\u0E82\\u0E84-\\u0E84"
+ "\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5-\\u0EA5\\u0EA7-\\u0EB0"
+ "\\u0EB2-\\u0EB3\\u0EBD-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6-\\u0EC6"
+ "\\u0EDC-\\u0EDF\\u0F00-\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C"
+ "\\u0F88-\\u0F8C\\u1000-\\u102A\\u103F-\\u103F\\u1050-\\u1055"
+ "\\u105A-\\u105D\\u1061-\\u1061\\u1065-\\u1066\\u106E-\\u1070"
+ "\\u1075-\\u1081\\u108E-\\u108E\\u10A0-\\u10C5\\u10C7-\\u10C7"
+ "\\u10CD-\\u10CD\\u10D0-\\u10FA\\u10FC-\\u1248\\u124A-\\u124D"
+ "\\u1250-\\u1256\\u1258-\\u1258\\u125A-\\u125D\\u1260-\\u1288"
+ "\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE"
+ "\\u12C0-\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310"
+ "\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F5"
+ "\\u13F8-\\u13FD\\u1401-\\u166C\\u166F-\\u167F\\u1681-\\u169A"
+ "\\u16A0-\\u16EA\\u16F1-\\u16F8\\u1700-\\u170C\\u170E-\\u1711"
+ "\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770"
+ "\\u1780-\\u17B3\\u17D7-\\u17D7\\u17DC-\\u17DC\\u1820-\\u1878"
+ "\\u1880-\\u1884\\u1887-\\u18A8\\u18AA-\\u18AA\\u18B0-\\u18F5"
+ "\\u1900-\\u191E\\u1950-\\u196D\\u1970-\\u1974\\u1980-\\u19AB"
+ "\\u19B0-\\u19C9\\u1A00-\\u1A16\\u1A20-\\u1A54\\u1AA7-\\u1AA7"
+ "\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE-\\u1BAF"
+ "\\u1BBA-\\u1BE5\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C7D"
+ "\\u1C80-\\u1C88\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1CE9-\\u1CEC"
+ "\\u1CEE-\\u1CF3\\u1CF5-\\u1CF6\\u1CFA-\\u1CFA\\u1D00-\\u1DBF"
+ "\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
+ "\\u1F50-\\u1F57\\u1F59-\\u1F59\\u1F5B-\\u1F5B\\u1F5D-\\u1F5D"
+ "\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE-\\u1FBE"
+ "\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB"
+ "\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071-\\u2071"
+ "\\u207F-\\u207F\\u2090-\\u209C\\u2102-\\u2102\\u2107-\\u2107"
+ "\\u210A-\\u2113\\u2115-\\u2115\\u2119-\\u211D\\u2124-\\u2124"
+ "\\u2126-\\u2126\\u2128-\\u2128\\u212A-\\u212D\\u212F-\\u2139"
+ "\\u213C-\\u213F\\u2145-\\u2149\\u214E-\\u214E\\u2183-\\u2184"
+ "\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2CE4\\u2CEB-\\u2CEE"
+ "\\u2CF2-\\u2CF3\\u2D00-\\u2D25\\u2D27-\\u2D27\\u2D2D-\\u2D2D"
+ "\\u2D30-\\u2D67\\u2D6F-\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
+ "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6"
+ "\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F-\\u2E2F"
+ "\\u3005-\\u3006\\u3031-\\u3035\\u303B-\\u303C\\u3041-\\u3096"
+ "\\u309D-\\u309F\\u30A1-\\u30FA\\u30FC-\\u30FF\\u3105-\\u312F"
+ "\\u3131-\\u318E\\u31A0-\\u31BF\\u31F0-\\u31FF\\u3400-\\u4DBF"
+ "\\u4E00-\\u9FFC\\uA000-\\uA48C\\uA4D0-\\uA4FD\\uA500-\\uA60C"
+ "\\uA610-\\uA61F\\uA62A-\\uA62B\\uA640-\\uA66E\\uA67F-\\uA69D"
+ "\\uA6A0-\\uA6E5\\uA717-\\uA71F\\uA722-\\uA788\\uA78B-\\uA7BF"
+ "\\uA7C2-\\uA7CA\\uA7F5-\\uA801\\uA803-\\uA805\\uA807-\\uA80A"
+ "\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA8F2-\\uA8F7"
+ "\\uA8FB-\\uA8FB\\uA8FD-\\uA8FE\\uA90A-\\uA925\\uA930-\\uA946"
+ "\\uA960-\\uA97C\\uA984-\\uA9B2\\uA9CF-\\uA9CF\\uA9E0-\\uA9E4"
+ "\\uA9E6-\\uA9EF\\uA9FA-\\uA9FE\\uAA00-\\uAA28\\uAA40-\\uAA42"
+ "\\uAA44-\\uAA4B\\uAA60-\\uAA76\\uAA7A-\\uAA7A\\uAA7E-\\uAAAF"
+ "\\uAAB1-\\uAAB1\\uAAB5-\\uAAB6\\uAAB9-\\uAABD\\uAAC0-\\uAAC0"
+ "\\uAAC2-\\uAAC2\\uAADB-\\uAADD\\uAAE0-\\uAAEA\\uAAF2-\\uAAF4"
+ "\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26"
+ "\\uAB28-\\uAB2E\\uAB30-\\uAB5A\\uAB5C-\\uAB69\\uAB70-\\uABE2"
+ "\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uF900-\\uFA6D"
+ "\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB1D"
+ "\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E-\\uFB3E"
+ "\\uFB40-\\uFB41\\uFB43-\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D"
+ "\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74"
+ "\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFFBE"
+ "\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC";
+
+ std::string unicode_number_expr =
+ "\\u0030-\\u0039\\u00B2-\\u00B3\\u00B9-\\u00B9\\u00BC-\\u00BE"
+ "\\u0660-\\u0669\\u06F0-\\u06F9\\u07C0-\\u07C9\\u0966-\\u096F"
+ "\\u09E6-\\u09EF\\u09F4-\\u09F9\\u0A66-\\u0A6F\\u0AE6-\\u0AEF"
+ "\\u0B66-\\u0B6F\\u0B72-\\u0B77\\u0BE6-\\u0BF2\\u0C66-\\u0C6F"
+ "\\u0C78-\\u0C7E\\u0CE6-\\u0CEF\\u0D58-\\u0D5E\\u0D66-\\u0D78"
+ "\\u0DE6-\\u0DEF\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F33"
+ "\\u1040-\\u1049\\u1090-\\u1099\\u1369-\\u137C\\u16EE-\\u16F0"
+ "\\u17E0-\\u17E9\\u17F0-\\u17F9\\u1810-\\u1819\\u1946-\\u194F"
+ "\\u19D0-\\u19DA\\u1A80-\\u1A89\\u1A90-\\u1A99\\u1B50-\\u1B59"
+ "\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\u2070-\\u2070"
+ "\\u2074-\\u2079\\u2080-\\u2089\\u2150-\\u2182\\u2185-\\u2189"
+ "\\u2460-\\u249B\\u24EA-\\u24FF\\u2776-\\u2793\\u2CFD-\\u2CFD"
+ "\\u3007-\\u3007\\u3021-\\u3029\\u3038-\\u303A\\u3192-\\u3195"
+ "\\u3220-\\u3229\\u3248-\\u324F\\u3251-\\u325F\\u3280-\\u3289"
+ "\\u32B1-\\u32BF\\uA620-\\uA629\\uA6E6-\\uA6EF\\uA830-\\uA835"
+ "\\uA8D0-\\uA8D9\\uA900-\\uA909\\uA9D0-\\uA9D9\\uA9F0-\\uA9F9"
+ "\\uAA50-\\uAA59\\uABF0-\\uABF9\\uFF10-\\uFF19";
+
+ std::wstring wpat_expr = utf8_to_wstring(
+ "'s|'t|'re|'ve|'m|'ll|'d| ?[" + unicode_letter_expr + "]+| ?[" +
+ unicode_number_expr + "]+| ?[^\\s" + unicode_letter_expr +
+ unicode_number_expr + "]+|\\s+(?!\\S)|\\s+");
+
+ const std::wregex pat = std::wregex(wpat_expr);
+  std::unordered_map<std::string, std::vector<std::string>> cache;
+ void load_vocab(std::string const &vocab_file);
+ void load_merge(std::string const &merge_file);
+
+  std::unordered_set<wbigram_pair, hash_pair>
+      get_pairs(std::vector<std::wstring> word);
+ std::wstring utf8_to_wstring(std::string const &src);
+ std::u32string utf8_to_utf32(std::string const &src);
+ std::string wstring_to_utf8(std::wstring const &src);
+ std::string utf32_to_utf8(std::u32string const &src);
+
+  std::vector<std::string> split(std::string const &s,
+                                 std::regex rgx = std::regex("\\s+"));
+};
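+
+// Illustrative usage sketch (not part of the header); the vocab/merge file
+// paths are placeholders and the int32_t id type mirrors encode()/decode():
+//   GPT_Tokenizer tokenizer(OPT_TOKENIZER, "vocab.json", "merges.txt");
+//   std::vector<int32_t> input_ids, mask_ids;
+//   tokenizer.encode("hello world", /*max_length=*/128, &input_ids, &mask_ids);
+//   std::string text = tokenizer.decode(input_ids, mask_ids);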
diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h
index 2e0cf1ca4b..9dc6572593 100644
--- a/include/flexflow/graph.h
+++ b/include/flexflow/graph.h
@@ -24,7 +24,7 @@
#include "legion/legion_utilities.h"
#include
-extern LegionRuntime::Logger::Category log_dp;
+extern Legion::Logger log_dp;
namespace FlexFlow::PCG {
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
new file mode 100644
index 0000000000..ba4101c173
--- /dev/null
+++ b/include/flexflow/inference.h
@@ -0,0 +1,51 @@
+/* Copyright 2022 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "flexflow/batch_config.h"
+#include <string>
+#include <vector>
+
+namespace FlexFlow {
+
+struct GenerationConfig {
+ bool do_sample = false;
+ float temperature = 0.8;
+ float topp = 0.6;
+ GenerationConfig(bool _do_sample, float _temperature, float _topp) {
+ temperature = _temperature > 0 ? _temperature : temperature;
+ topp = _topp > 0 ? _topp : topp;
+ do_sample = _do_sample;
+ }
+ GenerationConfig() {}
+};
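+// Illustrative: non-positive values fall back to the defaults above, e.g.
+// GenerationConfig(true, -1.0f, 0.9f) keeps temperature = 0.8 and sets
+// topp = 0.9.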
+
+struct GenerationResult {
+ using RequestGuid = BatchConfig::RequestGuid;
+ using TokenId = BatchConfig::TokenId;
+ RequestGuid guid;
+ std::string input_text;
+ std::string output_text;
+  std::vector<TokenId> input_tokens;
+  std::vector<TokenId> output_tokens;
+  std::vector<float> finetuning_losses;
+};
+
+#include <string>
+#include <vector>
+
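+// Joins path components into a single path, e.g. {"weights", "llama"} would
+// presumably yield "weights/llama" (illustrative; the exact separator handling
+// is defined in the implementation).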
+std::string join_path(std::vector<std::string> const &paths);
+
+} // namespace FlexFlow
diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h
index 0c1d7a6092..c3dbcac422 100644
--- a/include/flexflow/layer.h
+++ b/include/flexflow/layer.h
@@ -49,9 +49,10 @@ class Layer {
Tensor outputs[MAX_NUM_OUTPUTS];
Tensor inputs[MAX_NUM_INPUTS];
Tensor weights[MAX_NUM_WEIGHTS];
- bool trainableInputs[MAX_NUM_INPUTS];
+ // bool trainable_inputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
+ bool inference_debugging;
private:
std::unordered_map int_properties;
diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h
index 8843dc4d6a..807b0c9c0d 100644
--- a/include/flexflow/machine_view.h
+++ b/include/flexflow/machine_view.h
@@ -4,7 +4,11 @@
#include "legion.h"
#include
#ifdef FF_USE_NCCL
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include <nccl.h>
+#else
+#include <rccl/rccl.h>
+#endif
#endif
#include "flexflow/config.h"
diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h
index 71be1892aa..e8337818ec 100644
--- a/include/flexflow/mapper.h
+++ b/include/flexflow/mapper.h
@@ -83,11 +83,10 @@ class FFMapper : public NullMapper {
Task const &task,
MapTaskInput const &input,
MapTaskOutput &output);
- virtual void map_replicate_task(const MapperContext ctx,
- Task const &task,
- MapTaskInput const &input,
- MapTaskOutput const &default_output,
- MapReplicateTaskOutput &output);
+ virtual void replicate_task(const MapperContext ctx,
+ Task const &task,
+ ReplicateTaskInput const &input,
+ ReplicateTaskOutput &output);
virtual void select_task_variant(const MapperContext ctx,
Task const &task,
SelectVariantInput const &input,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index cb1b26d624..4ad735ef7d 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -17,10 +17,12 @@
#include "accessor.h"
#include "config.h"
#include "device.h"
+#include "flexflow/inference.h"
#include "flexflow/memory_optimization.h"
#include "flexflow/node.h"
#include "flexflow/operator_params.h"
#include "flexflow/utils/hash_utils.h"
+#include "flexflow/utils/memory_allocator.h"
#include "flexflow/utils/tuple.h"
#include "initializer.h"
#include "layer.h"
@@ -30,6 +32,7 @@
#include "optimizer.h"
#include "parallel_tensor.h"
#include "recompile.h"
+#include "runtime.h"
#include "simulator.h"
#include "tensor.h"
#include "tl/optional.hpp"
@@ -50,11 +53,17 @@ enum TaskIDs {
LOAD_IMAGES_TASK_ID,
NORMALIZE_IMAGES_TASK_ID,
ELEMENTBINARY_INIT_TASK_ID,
+ ELEMENTBINARY_INF_TASK_ID,
ELEMENTBINARY_FWD_TASK_ID,
ELEMENTBINARY_BWD_TASK_ID,
ELEMENTUNARY_INIT_TASK_ID,
ELEMENTUNARY_FWD_TASK_ID,
+ ELEMENTUNARY_INF_TASK_ID,
ELEMENTUNARY_BWD_TASK_ID,
+ EXPERTS_INIT_TASK_ID,
+ EXPERTS_FWD_TASK_ID,
+ EXPERTS_BWD_TASK_ID,
+ EXPERTS_INF_TASK_ID,
CONV2D_INIT_TASK_ID,
CONV2D_INIT_PARA_TASK_ID,
CONV2D_FWD_TASK_ID,
@@ -65,6 +74,7 @@ enum TaskIDs {
DROPOUT_BWD_TASK_ID,
EMBED_INIT_TASK_ID,
EMBED_FWD_TASK_ID,
+ EMBED_INF_TASK_ID,
EMBED_BWD_TASK_ID,
GATHER_INIT_TASK_ID,
GATHER_FWD_TASK_ID,
@@ -96,19 +106,41 @@ enum TaskIDs {
BATCHMATMUL_BWD_TASK_ID,
LAYERNORM_INIT_TASK_ID,
LAYERNORM_FWD_TASK_ID,
+ LAYERNORM_INF_TASK_ID,
LAYERNORM_BWD_TASK_ID,
+ LAYERNORM_PEFT_BWD_TASK_ID,
+ RESIDUAL_LAYERNORM_INIT_TASK_ID,
+ RESIDUAL_LAYERNORM_INF_TASK_ID,
+ RESIDUAL_LAYERNORM_BWD_TASK_ID,
+ RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
+ SIGMOID_SILU_MULTI_INIT_TASK_ID,
+ SIGMOID_SILU_MULTI_INF_TASK_ID,
+ SIGMOID_SILU_MULTI_BWD_TASK_ID,
+ SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID,
LINEAR_INIT_TASK_ID,
LINEAR_INIT_PARA_TASK_ID,
+ LINEAR_INF_TASK_ID,
+ LINEAR_PEFT_BWD_TASK_ID,
LINEAR_FWD_TASK_ID,
LINEAR_BWD_TASK_ID,
LINEAR_BWD2_TASK_ID,
LINEAR_UPD_TASK_ID,
+ LORA_LINEAR_INIT_TASK_ID,
+ LORA_LINEAR_REG_TASK_ID,
+ LORA_LINEAR_INF_TASK_ID,
+ LORA_LINEAR_PEFT_BWD_TASK_ID,
FLAT_INIT_TASK_ID,
FLAT_FWD_TASK_ID,
FLAT_BWD_TASK_ID,
SOFTMAX_INIT_TASK_ID,
SOFTMAX_FWD_TASK_ID,
SOFTMAX_BWD_TASK_ID,
+ SOFTMAX_INF_TASK_ID,
+ SOFTMAX_PEFT_BWD_TASK_ID,
CONCAT_INIT_TASK_ID,
CONCAT_FWD_TASK_ID,
CONCAT_BWD_TASK_ID,
@@ -127,16 +159,46 @@ enum TaskIDs {
TOPK_INIT_TASK_ID,
TOPK_FWD_TASK_ID,
TOPK_BWD_TASK_ID,
+ ARG_TOPK_INIT_TASK_ID,
+ ARG_TOPK_INF_TASK_ID,
+ ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+ SAMPLING_INIT_TASK_ID,
+ SAMPLING_INF_TASK_ID,
+ ARGMAX_INIT_TASK_ID,
+ ARGMAX_BEAM_INF_TASK_ID,
+ ARGMAX_NORM_INF_TASK_ID,
TRANSPOSE_INIT_TASK_ID,
TRANSPOSE_FWD_TASK_ID,
TRANSPOSE_BWD_TASK_ID,
ATTENTION_INIT_TASK_ID,
ATTENTION_FWD_TASK_ID,
ATTENTION_BWD_TASK_ID,
+ RMSNORM_INIT_TASK_ID,
+ RMSNORM_FWD_TASK_ID,
+ RMSNORM_INF_TASK_ID,
+ RMSNORM_BWD_TASK_ID,
+ RMSNORM_PEFT_BWD_TASK_ID,
+ RESIDUAL_RMSNORM_INIT_TASK_ID,
+ RESIDUAL_RMSNORM_INF_TASK_ID,
+ RESIDUAL_RMSNORM_BWD_TASK_ID,
+ RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID,
+ BEAM_TOPK_INIT_TASK_ID,
+ BEAM_TOPK_INF_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID,
+ SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
+ TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
MSELOSS_BWD_TASK_ID,
FUSEDOP_INIT_TASK_ID,
+ FUSEDOP_PEFT_BWD_TASK_ID,
FUSEDOP_FWD_TASK_ID,
FUSEDOP_BWD_TASK_ID,
+ FUSEDOP_INF_TASK_ID,
NOOP_INIT_TASK_ID,
// Metrics tasks
METRICS_COMP_TASK_ID,
@@ -160,6 +222,7 @@ enum TaskIDs {
// NCCL tasks
NCCL_GETUNIQUEID_TASK_ID,
NCCL_INIT_COMMS_TASK_ID,
+ NCCL_FINISH_COMMS_TASK_ID,
// Search
STRATEGY_SEARCH_TASK_ID,
// Graph
@@ -180,19 +243,41 @@ enum TaskIDs {
REPARTITION_BWD_TASK_ID,
COMBINE_INIT_TASK_ID,
COMBINE_FWD_TASK_ID,
+ COMBINE_INF_TASK_ID,
COMBINE_BWD_TASK_ID,
+ COMBINE_PEFT_BWD_TASK_ID,
REPLICATE_INIT_TASK_ID,
REPLICATE_FWD_TASK_ID,
REPLICATE_BWD_TASK_ID,
+ REPLICATE_PEFT_BWD_TASK_ID,
REDUCTION_INIT_TASK_ID,
REDUCTION_FWD_TASK_ID,
REDUCTION_BWD_TASK_ID,
PIPELINE_INIT_TASK_ID,
PIPELINE_FWD_TASK_ID,
PIPELINE_BWD_TASK_ID,
+ ALLREDUCE_INIT_TASK_ID,
+ ALLREDUCE_FWD_TASK_ID,
+ ALLREDUCE_BWD_TASK_ID,
+ ALLREDUCE_INF_TASK_ID,
+ ALLREDUCE_PEFT_BWD_TASK_ID,
+ PARALLEL_IDENTITY_INIT_TASK_ID,
+ PARALLEL_IDENTITY_FWD_TASK_ID,
+ PARALLEL_IDENTITY_BWD_TASK_ID,
+ PARALLEL_IDENTITY_INF_TASK_ID,
+ PARALLEL_IDENTITY_PEFT_BWD_TASK_ID,
FUSED_PARALLELOP_INIT_TASK_ID,
FUSED_PARALLELOP_FWD_TASK_ID,
FUSED_PARALLELOP_BWD_TASK_ID,
+ // InferenceManager & RequestManager
+ RM_LOAD_TOKENS_TASK_ID,
+ RM_LOAD_POSITION_TASK_ID,
+ RM_LOAD_BATCH_CONFIG_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
+ RM_BACKGROUND_SERVING_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
CUSTOM_GPU_TASK_ID_1,
@@ -216,6 +301,8 @@ enum TaskIDs {
// Make sure PYTHON_TOP_LEVEL_TASK_ID is
// consistent with python/main.cc
PYTHON_TOP_LEVEL_TASK_ID = 11111,
+ // Tensor Equal Task
+ TENSOR_EQUAL_TASK_ID,
};
enum ShardingID {
@@ -259,26 +346,44 @@ class Dropout;
class ElementBinary;
class ElementUnary;
class Embedding;
+class Experts;
class Flat;
class Gather;
class Group_by;
class LayerNorm;
+class ResidualLayerNorm;
+class AddBiasResidualLayerNorm;
+class SigmoidSiluMulti;
class Linear;
+class LoraLinear;
class MultiHeadAttention;
+class IncMultiHeadSelfAttention;
+class TreeIncMultiHeadSelfAttention;
class Pool2D;
class Reduce;
class Reshape;
class Softmax;
class Split;
class TopK;
+class ArgTopK;
class Transpose;
+class RMSNorm;
+class ResidualRMSNorm;
+class BeamTopK;
+class SpecIncMultiHeadSelfAttention;
+class Sampling;
+class ArgMax;
class Combine;
class Repartition;
class Reduction;
class Replicate;
+class AllReduce;
+class ParallelIdentity;
class FusedParallelOp;
class ParallelOpInfo;
+struct Request;
+
// TODO: Move to an appropriate place
/*
This is used to create a type that recursively replaces value type
@@ -325,12 +430,14 @@ std::vector
class FFModel {
public:
- FFModel(FFConfig &config);
+ FFModel(FFConfig &config, bool cpu_offload = false);
+ ~FFModel();
static constexpr float PROPAGATION_CHANCE = 0.25;
static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75;
static constexpr float PROPAGATION_SIZE_WEIGHT = 1.0;
+ bool cpu_offload;
// C++ APIs for constructing models
// Add an exp layer
Tensor exp(const Tensor x, char const *name = NULL);
@@ -422,7 +529,7 @@ class FFModel {
char const *name = NULL);
// Add an embedding layer
Tensor embedding(const Tensor input,
- int num_entires,
+ int num_entries,
int outDim,
AggrMode aggr,
DataType dtype = DT_FLOAT,
@@ -468,12 +575,43 @@ class FFModel {
PoolType type = POOL_MAX,
ActiMode activation = AC_MODE_NONE,
char const *name = NULL);
- // Add a batch_norm layer
+ // Add a layer_norm layer
Tensor layer_norm(const Tensor input,
std::vector const &axes,
bool elementwise_affine,
float eps,
+ bool use_bias = true,
+ DataType data_type = DT_NONE,
char const *name = NULL);
+ // Add a layer_norm layer with residual(s)
+ void residual_layer_norm(const Tensor input,
+ const Tensor residual1,
+ const Tensor residual2,
+ Tensor *outputs,
+ bool use_two_residuals,
+                           std::vector<int> const &axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias = true,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a add_bias_residual_layer_norm layer
+ void add_bias_residual_layer_norm(const Tensor input,
+ const Tensor residual,
+ Tensor *outputs,
+                                    std::vector<int> const &axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias = true,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a sigmoid_silu_multi layer
+ Tensor sigmoid_silu_multi(const Tensor input1,
+ const Tensor input2,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
// Add a batch_norm layer
Tensor
batch_norm(const Tensor input, bool relu = true, char const *name = NULL);
@@ -483,12 +621,33 @@ class FFModel {
int a_seq_length_dim = -1,
int b_seq_length_dim = -1,
char const *name = nullptr);
+ // Add a root mean square layer
+ Tensor rms_norm(const Tensor input,
+ float eps,
+ int dim,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a residual root mean square layer
+ void residual_rms_norm(const Tensor input1,
+ const Tensor input2,
+ Tensor *outputs,
+ float eps,
+ int dim,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a beam search top k layer
+ Tensor beam_top_k(const Tensor input,
+ int max_beam_size,
+ bool sorted,
+ char const *name = NULL);
+
// Add a dense layer
Tensor dense(const Tensor input,
int outDim,
ActiMode activation = AC_MODE_NONE,
bool use_bias = true,
- DataType data_type = DT_FLOAT,
+ DataType data_type = DT_NONE,
Layer const *shared_op = NULL,
Initializer *kernel_initializer = NULL,
Initializer *bias_initializer = NULL,
@@ -500,6 +659,16 @@ class FFModel {
// Add a concat layer
Tensor
concat(int n, Tensor const *tensors, int axis, char const *name = NULL);
+ // Add an experts layer
+ Tensor experts(
+ Tensor const *inputs,
+ int num_experts,
+ int experts_start_idx,
+ int experts_output_dim_size,
+ float alpha,
+ int experts_num_layers = 1, // number of linear layers per expert
+ int experts_internal_dim_size = 0, // hidden dimension for internal layers
+ char const *name = NULL);
// Add a mean layer
Tensor mean(const Tensor input,
std::vector const &dims,
@@ -521,7 +690,10 @@ class FFModel {
// Add a flat layer
Tensor flat(const Tensor input, char const *name = NULL);
// Add a softmax layer
- Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL);
+ Tensor softmax(const Tensor input,
+ int dim = -1,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
// Create input tensors and constants
Tensor transpose(const Tensor input,
std::vector const &perm,
@@ -539,6 +711,14 @@ class FFModel {
int k,
bool sorted,
char const *name = NULL);
+ Tensor arg_top_k(const Tensor input,
+ // Tensor *outputs,
+ int k,
+ bool sorted,
+ bool speculative_decoding,
+ char const *name = NULL);
+ Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL);
+ Tensor sampling(const Tensor input, float top_p, char const *name = NULL);
Tensor multihead_attention(const Tensor query,
const Tensor key,
const Tensor value,
@@ -550,8 +730,127 @@ class FFModel {
bool bias = true,
bool add_bias_kv = false,
bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
+ Tensor inc_multihead_self_attention(const Tensor input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor
+ spec_inc_multihead_self_attention(const Tensor input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multihead_self_attention_verify(
+ const Tensor input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multiquery_self_attention(const Tensor input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor
+ spec_inc_multiquery_self_attention(const Tensor input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multiquery_self_attention_verify(
+ const Tensor input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ // ========================================
+ // PEFT Layers
+ // ========================================
+ PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
+ // ========================================
+ // Inference APIs
+ // ========================================
+  std::vector<GenerationResult> generate(std::vector<Request> const &requests);
+
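+  // Illustrative serving sketch (not part of the class); the LoraLinearConfig
+  // constructor arguments are assumptions, and Request is only
+  // forward-declared in this header:
+  //   LoraLinearConfig peft_config(cache_folder, "my-peft-adapter");
+  //   PEFTModelID *peft_id = ff.add_lora_layer(peft_config);
+  //   std::vector<Request> requests = ...;
+  //   std::vector<GenerationResult> results = ff.generate(requests);
+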
Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
DataType data_type,
@@ -683,6 +982,7 @@ class FFModel {
auto input_shapes = get_input_shape(input);
if (!params.is_valid(input_shapes)) {
+ printf("!params.is_valid(input_shapes)\n");
return PCG::Node::INVALID_NODE;
}
@@ -690,7 +990,7 @@ class FFModel {
std::pair::type, Params> key{
input_shapes, params};
- auto &cache = get::type, Params>,
T *>>(this->cached_ops);
auto const &it = cache.find(key);
@@ -765,16 +1065,29 @@ class FFModel {
std::vector const ®ions,
Legion::Context ctx,
Legion::Runtime *runtime);
+ // ========================================
+ // Internal APIs that should not be invoked from applications
+ // ========================================
void reset_metrics();
void init_operators();
+ void init_operators_inference(
+      std::vector<ParallelTensor> const &batch_inputs,
+      std::vector<ParallelTensor> const &batch_outputs);
void prefetch();
void forward(int seq_length = -1);
void compute_metrics();
void get_metrics();
void backward(int seq_length = -1);
void update();
-  bool apply_fusion(std::vector<Op *> const &operators,
-                    std::vector<Op *> &new_operators);
+  bool apply_fusion(
+      std::vector<Op *> const &operators,
+      std::vector<Op *> &new_operators,
+      std::unordered_map<ParallelTensor, std::vector<ParallelTensor>>
+          *parallel_tensor_mapping = nullptr);
+  bool check_operators_integrity(
+      std::vector<Op *> const &old_operators,
+      std::unordered_map<ParallelTensor, std::vector<ParallelTensor>>
+          *pt_mapping = nullptr);
Op *get_final_operator() const;
void compile(LossType loss_type,
std::vector const &metrics,
@@ -783,6 +1096,9 @@ class FFModel {
LossType loss_type,
std::vector const &metrics,
CompMode comp_mode = COMP_MODE_TRAINING);
+ void compile_inference();
+ void set_transformer_layer_id(int id);
+ void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
std::unique_ptr &best_graph,
@@ -801,6 +1117,7 @@ class FFModel {
bool use_propagation) const;
#ifdef FF_USE_NCCL
ncclComm_t *find_nccl_comms(MachineView const &view) const;
+ void finish_nccl_comms();
#endif
#ifdef FF_USE_PROPAGATE
void propagate(std::map const ¤t,
@@ -816,7 +1133,7 @@ class FFModel {
std::unordered_map>>
get_bwd_edge_map() const;
- // Internal funcitons
+ // Internal functions
Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc);
Legion::IndexSpace get_or_create_task_is(MachineView const &view);
Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain);
@@ -824,6 +1141,10 @@ class FFModel {
Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
Legion::IndexSpace get_task_is(MachineView const &view) const;
+ bool need_to_add_combine(int layer_idx) const;
+ bool need_to_add_allreduce(int layer_idx) const;
+ bool need_to_add_parallel_identity(int layer_idx) const;
+ bool is_mlp_block(int layer_idx) const;
void create_operators_from_layers();
Op *create_operator_from_layer(Layer *layer,
std::vector const &inputs);
@@ -837,8 +1158,11 @@ class FFModel {
void clear_graph_search_cache();
public:
- size_t op_global_guid, layer_global_guid;
+ size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
+ size_t current_transformer_layer_id;
+ // positional embedding start offset
+ int position_offset;
FFConfig config;
FFIterationConfig iter_config;
Optimizer *optimizer;
@@ -854,6 +1178,12 @@ class FFModel {
std::vector layers;
std::vector operators;
std::vector parameters;
+ // PEFT related
+  std::unordered_map<Layer *, Layer *> base_layer_to_peft_layer;
+  std::unordered_map<Layer *, std::vector<PEFTModelID>> peft_layer_to_peft_id;
+  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
+  // std::vector<Op *> peft_operators;
+
FFHandler handlers[MAX_NUM_WORKERS];
Legion::Future current_metrics;
// Cached operators: key: operator hash, value: operator pointer
@@ -883,6 +1213,9 @@ class FFModel {
ElementUnary *>,
std::unordered_map,
Embedding *>,
+ std::unordered_map<
+ std::pair, ExpertsParams>,
+ Experts *>,
std::unordered_map, Flat *>,
std::unordered_map<
std::pair,
@@ -894,8 +1227,25 @@ class FFModel {
Group_by *>,
std::unordered_map,
LayerNorm *>,
+ std::unordered_map,
+ ResidualLayerNormParams>,
+ ResidualLayerNorm *>,
+ std::unordered_map<
+ std::pair,
+ AddBiasResidualLayerNormParams>,
+ AddBiasResidualLayerNorm *>,
+ std::unordered_map<
+ std::pair,
+ SigmoidSiluMultiParams>,
+ SigmoidSiluMulti *>,
std::unordered_map,
Linear *>,
+ std::unordered_map<
+ std::pair,
+ LoraLinearParams>,
+ LoraLinear *>,
std::unordered_map,
Pool2D *>,
std::unordered_map,
MultiHeadAttentionParams>,
MultiHeadAttention *>,
+ std::unordered_map<
+ std::pair,
+ IncMultiHeadSelfAttention *>,
+ std::unordered_map,
+ BeamTopK *>,
+ std::unordered_map,
+ Sampling *>,
+ std::unordered_map,
+ ArgMax *>,
+ std::unordered_map<
+ std::pair,
+ SpecIncMultiHeadSelfAttention *>,
+ std::unordered_map<
+ std::pair,
+ TreeIncMultiHeadSelfAttention *>,
std::unordered_map,
Reduce *>,
std::unordered_map,
@@ -911,8 +1276,16 @@ class FFModel {
std::unordered_map,
Softmax *>,
std::unordered_map, TopK *>,
+ std::unordered_map,
+ ArgTopK *>,
std::unordered_map,
Transpose *>,
+ std::unordered_map,
+ RMSNorm *>,
+ std::unordered_map<
+ std::pair,
+ ResidualRMSNormParams>,
+ ResidualRMSNorm *>,
std::unordered_map,
Repartition *>,
std::unordered_map,
@@ -921,12 +1294,18 @@ class FFModel {
Reduction *>,
std::unordered_map,
Combine *>,
+ std::unordered_map,
+ AllReduce *>,
+ std::unordered_map,
+ ParallelIdentity *>,
std::unordered_map,
FusedParallelOp *>>
cached_ops;
std::unordered_map cached_noop_ops;
std::unordered_map cached_input_ops;
std::vector all_valid_views;
+ int model_id; // unique incremental id assigned to each model. Used in the
+ // inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map view_hash_to_nccl_comms;
#endif
@@ -955,6 +1334,9 @@ class FFModel {
ElementUnary *
unary(OperatorType op, char const *name = NULL, float scalar = 0.0);
PCG::Node new_node(Op *);
+ static int model_counter; // number of instantiated FFModel objects. Used to
+ // assign a unique incremental id to each model.
+ // Used in the inference_debugging mode.
};
class UtilityTasks {
diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h
index 512844db92..d31c12b16c 100644
--- a/include/flexflow/op_meta.h
+++ b/include/flexflow/op_meta.h
@@ -9,13 +9,19 @@ class Op;
class OpMeta {
public:
- OpMeta(FFHandler _handle);
+ // OpMeta(FFHandler _handle);
OpMeta(FFHandler _handle, Op const *op);
public:
FFHandler handle;
bool profiling; // Measure the run time of the task
- bool trainableInputs[MAX_NUM_INPUTS];
+ bool inference_debugging;
+ int decoding_step;
+ int bwd_step;
+ char op_name[MAX_OPNAME];
+ LayerID layer_guid;
+ bool trainable_inputs[MAX_NUM_INPUTS];
+ bool reset_input_grads[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
DataType output_type[MAX_NUM_OUTPUTS];
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 3fd84ce55b..1a5af67b36 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -1,15 +1,27 @@
#ifndef _OPERATOR_H
#define _OPERATOR_H
+#include "flexflow/accessor.h"
+#include "flexflow/batch_config.h"
#include "flexflow/fftype.h"
#include "flexflow/machine_view.h"
#include "flexflow/parallel_tensor.h"
#include "flexflow/utils/dot/record_formatter.h"
+#include <filesystem>
#include
+namespace fs = std::filesystem;
+
+#include <fstream>
+#include <iostream>
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "flexflow/utils/cuda_helper.h"
+#else
+#include "flexflow/utils/hip_helper.h"
+#endif
namespace FlexFlow {
-extern LegionRuntime::Logger::Category log_measure;
+extern Legion::Logger log_measure;
class OpMeta;
class Simulator;
@@ -19,11 +31,38 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT };
enum class MappingOperation { PARTITION, REPLICATE };
+fs::path get_dst_folder(std::string const &subdir,
+ int step_idx = 0,
+ int shard_idx = 0,
+ bool before_kernel = false);
+
+/** @brief A class to keep track of a dimension relation between two tensors
+ * used by an operator.
+ *
+ * Dimension relations are one-to-one mappings between the dimensions of the
+ * input, weights, and output tensors of an operator. Introduced in the Unity
+ * paper, dimension relations allow FlexFlow to keep track of an operator's
+ * parallelization plans as part of the Parallel Computation Graph (PCG).
+ *
+ * Each ParallelDimMappingRecord only keeps track of a single dimension
+ * relation.
+ *
+ * ParallelDimMappingRecord objects must be initialized with a
+ * MappingRecordType, which can be INPUT_OUTPUT, if the ParallelDimMappingRecord
+ * is tracking a dimension relation between the input and the output tensor, or
+ * INPUT_WEIGHT, if the ParallelDimMappingRecord is tracking a dimension
+ * relation between the input tensor and the weights tensor.
+ *
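+ * For example, a data-parallel Linear operator would typically record an
+ * INPUT_OUTPUT relation between the batch dimension of its input and the
+ * batch dimension of its output, so that partitioning one dimension implies
+ * the same partitioning of the other (illustrative; the concrete records are
+ * created by each operator).
+ *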
+ */
class ParallelDimMappingRecord {
private:
ParallelDimMappingRecord(MappingRecordType);
public:
+ /**
+ * @brief We disable this constructor because ParallelDimMappingRecord objects
+ * must specify the MappingRecordType upon creation.
+ */
ParallelDimMappingRecord() = delete;
static ParallelDimMappingRecord input_output_record(
@@ -160,6 +199,7 @@ class Op {
const ParallelTensor input4 = NULL);
Op(int guid,
bool profiling,
+ bool inference_debugging,
OperatorType otype,
DataType dtype,
char const *name,
@@ -185,9 +225,182 @@ class Op {
virtual bool get_weight_parameter(TNParameter, DIMParameter, int *) const;
// Pure virtual functions that must be implemented
virtual void init(FFModel const &) = 0;
+ virtual void init_inference(FFModel const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) {
+ assert(false);
+ };
virtual void forward(FFModel const &) = 0;
virtual void backward(FFModel const &) = 0;
+ // Pure virtual functions for inference
+ virtual Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+                                      std::vector<ParallelTensor> const &,
+                                      std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) {
+ assert(false);
+ Legion::FutureMap empty_map;
+ return empty_map;
+ };
+ virtual Legion::FutureMap peft_bwd(FFModel const &,
+ BatchConfigFuture const &,
+                                     std::vector<ParallelTensor> const &,
+                                     std::vector<ParallelTensor> const &,
+                                     MachineView const *mv = nullptr) {
+    assert(false);
+    Legion::FutureMap empty_map;
+    return empty_map;
+  }
virtual void print_layer(FFModel const &model) = 0;
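+  // Strips the trailing "_<uid>" suffix from an operator name: e.g.
+  // "layers_11_attention_42" becomes "layers_11_attention", while a name
+  // without a trailing numeric suffix is returned unchanged.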
+  template <typename OpMetaType>
+ static std::string get_op_name_without_uid(OpMetaType *m) {
+ std::string op_name_without_uid = std::string(m->op_name);
+ size_t last_underscore = op_name_without_uid.length();
+ for (int i = op_name_without_uid.length() - 1; i > 0; i--) {
+ if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) {
+ break;
+ } else if (m->op_name[i] == '_') {
+ last_underscore = i;
+ }
+ }
+ if (last_underscore < op_name_without_uid.length()) {
+ op_name_without_uid.erase(last_underscore);
+ }
+ return op_name_without_uid;
+ }
+  template <typename OpMetaType>
+ static void save_inference_tensors_to_file(
+ OpMetaType *m,
+ int shard_id,
+ BatchConfig const *bc,
+      std::vector<GenericTensorAccessorR> input_tensors,
+      std::vector<GenericTensorAccessorR> weight_tensors,
+      std::vector<GenericTensorAccessorR> output_tensors,
+ bool fwd_pass = true,
+ bool before_kernel = false) {
+ // get operator name and print it
+ std::string op_name_without_uid = get_op_name_without_uid(m);
+ std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid
+ << std::endl;
+ // build the path to save the tensor
+ fs::path dst_filepath;
+ if (fwd_pass) {
+ dst_filepath =
+ get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel);
+ } else {
+ dst_filepath =
+ get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel);
+ }
+ if (m->layer_guid.model_id > 0) {
+ assert(false && "Model ID > 0 not supported yet");
+ }
+ std::string layername = "layers." +
+ std::to_string(m->layer_guid.transformer_layer_id) +
+ "." + op_name_without_uid;
+ dst_filepath /= layername;
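+    // e.g. tensors of the attention operator in transformer layer 5 are saved
+    // with the filename prefix "<dst folder>/layers.5.attention"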
+
+ // save batch config, if passed
+ if (bc != nullptr) {
+ bc->save_to_file(dst_filepath.string() + ".batch_config");
+ }
+
+ // save all inputs
+ for (int i = 0; i < input_tensors.size(); i++) {
+ std::string filename = dst_filepath.string() + ".input_";
+ if (fwd_pass) {
+ filename += std::to_string(i);
+ } else {
+ filename += "gradient_" + std::to_string(i);
+ }
+ if (input_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(input_tensors[i].get_float_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_HALF) {
+ save_tensor(input_tensors[i].get_half_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_INT32) {
+ save_tensor(input_tensors[i].get_int32_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_INT64) {
+ save_tensor(input_tensors[i].get_int64_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+
+ // only dump the weights in the forward pass, at the first step
+ // note that we do not save the weight gradients, since we only support
+ // finetuning LoRA weights, which are not FF tensors.
+ if (fwd_pass && m->decoding_step == 0) {
+ fs::path dst_filepath_weights =
+ get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
+ layername;
+ for (int i = 0; i < weight_tensors.size(); i++) {
+ std::string filename =
+ dst_filepath_weights.string() + ".weight_" + std::to_string(i);
+ if (weight_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(weight_tensors[i].get_float_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_HALF) {
+ save_tensor(weight_tensors[i].get_half_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_INT32) {
+ save_tensor(weight_tensors[i].get_int32_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_INT64) {
+ save_tensor(weight_tensors[i].get_int64_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+ }
+
+ // save all outputs
+ for (int i = 0; i < output_tensors.size(); i++) {
+ std::string filename = dst_filepath.string() + ".output_";
+ if (fwd_pass) {
+ filename += std::to_string(i);
+ } else {
+ filename += "gradient_" + std::to_string(i);
+ }
+ if (output_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(output_tensors[i].get_float_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_HALF) {
+ save_tensor(output_tensors[i].get_half_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_INT32) {
+ save_tensor(output_tensors[i].get_int32_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_INT64) {
+ save_tensor(output_tensors[i].get_int64_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+ // increase count of decoding steps
+ if (!before_kernel) {
+ if (fwd_pass) {
+ m->decoding_step++;
+ } else {
+ m->bwd_step++;
+ }
+ }
+ }
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
@@ -239,15 +452,29 @@ class Op {
std::vector const ®ions,
Legion::Context ctx,
Legion::Runtime *runtime);
+ static void
+ finish_nccl_comms_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
#endif
protected:
void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap);
+ void set_argumentmap_for_init_inference(FFModel const &ff,
+ Legion::ArgumentMap &argmap,
+ ParallelTensor const output0);
void set_argumentmap_for_forward(FFModel const &ff,
Legion::ArgumentMap &argmap);
+ void set_argumentmap_for_inference(FFModel const &ff,
+ Legion::ArgumentMap &argmap,
+ ParallelTensor const output0);
void set_argumentmap_for_backward(FFModel const &ff,
Legion::ArgumentMap &argmap);
void set_opmeta_from_futuremap(FFModel const &ff,
Legion::FutureMap const &fm);
+ void set_opmeta_from_futuremap_inference(FFModel const &ff,
+ Legion::FutureMap const &fm,
+ ParallelTensor const output0);
void solve_parallel_dim_mappings(
std::vector const &inputs,
std::vector const &weights,
@@ -265,10 +492,14 @@ class Op {
ParallelTensor outputs[MAX_NUM_OUTPUTS];
ParallelTensor inputs[MAX_NUM_INPUTS];
ParallelParameter weights[MAX_NUM_WEIGHTS];
- bool trainableInputs[MAX_NUM_INPUTS];
+ bool trainable_inputs[MAX_NUM_INPUTS];
+ bool reset_input_grads[MAX_NUM_INPUTS];
OpMeta *meta[MAX_NUM_WORKERS];
+ std::map inference_meta;
int numInputs, numWeights, numOutputs;
bool profiling;
+ bool inference_debugging;
+ bool add_bias_only_once;
#ifdef FF_USE_NCCL
ncclUniqueId ncclId;
#endif
diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h
index 24c84a85ed..673f78ad46 100644
--- a/include/flexflow/operator_params.h
+++ b/include/flexflow/operator_params.h
@@ -1,10 +1,14 @@
#ifndef _OPERATOR_PARAMS_H
#define _OPERATOR_PARAMS_H
+#include "flexflow/ops/add_bias_residual_layer_norm_params.h"
#include "flexflow/ops/aggregate_params.h"
#include "flexflow/ops/aggregate_spec_params.h"
+#include "flexflow/ops/arg_topk_params.h"
+#include "flexflow/ops/argmax_params.h"
#include "flexflow/ops/attention_params.h"
#include "flexflow/ops/batch_matmul_params.h"
+#include "flexflow/ops/beam_topk_params.h"
#include "flexflow/ops/cast_params.h"
#include "flexflow/ops/concat_params.h"
#include "flexflow/ops/conv_2d_params.h"
@@ -12,20 +16,32 @@
#include "flexflow/ops/element_binary_params.h"
#include "flexflow/ops/element_unary_params.h"
#include "flexflow/ops/embedding_params.h"
+#include "flexflow/ops/experts_params.h"
#include "flexflow/ops/flat_params.h"
#include "flexflow/ops/gather_params.h"
#include "flexflow/ops/groupby_params.h"
+#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
+#include "flexflow/ops/lora_linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
#include "flexflow/ops/reduce_params.h"
#include "flexflow/ops/reshape_params.h"
+#include "flexflow/ops/residual_layer_norm_params.h"
+#include "flexflow/ops/residual_rms_norm_params.h"
+#include "flexflow/ops/rms_norm_params.h"
+#include "flexflow/ops/sampling_params.h"
+#include "flexflow/ops/sigmoid_silu_multi_params.h"
#include "flexflow/ops/softmax_params.h"
+#include "flexflow/ops/spec_inc_multihead_self_attention_params.h"
#include "flexflow/ops/split_params.h"
#include "flexflow/ops/topk_params.h"
#include "flexflow/ops/transpose_params.h"
+#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
+#include "flexflow/parallel_ops/allreduce_params.h"
#include "flexflow/parallel_ops/combine_params.h"
#include "flexflow/parallel_ops/fused_parallel_op_params.h"
+#include "flexflow/parallel_ops/parallel_identity_params.h"
#include "flexflow/parallel_ops/partition_params.h"
#include "flexflow/parallel_ops/reduction_params.h"
#include "flexflow/parallel_ops/replicate_params.h"
@@ -49,19 +65,34 @@ using OperatorParameters = mp::variant;
tl::optional get_op_parameters(Op const *op);
diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h
new file mode 100644
index 0000000000..9510ac0f28
--- /dev/null
+++ b/include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/utils/memory_allocator.h"
+namespace FlexFlow {
+
+class AddBiasResidualLayerNormMeta;
+
+class AddBiasResidualLayerNorm : public Op {
+public:
+ using Params = AddBiasResidualLayerNormParams;
+  using Input = std::pair<ParallelTensor, ParallelTensor>;
+ AddBiasResidualLayerNorm(FFModel &model,
+ Params const ¶ms,
+ Input const &inputs,
+ char const *name = nullptr,
+ bool allocate_weights = false);
+ AddBiasResidualLayerNorm(FFModel &model,
+ LayerID const &_layer_guid,
+ const ParallelTensor _input,
+ const ParallelTensor _residual,
+                           std::vector<int> const &axes,
+ bool _elementwise_affine,
+ bool _use_bias,
+ float _eps,
+ bool _inplace_residual,
+ bool allocate_weights,
+ char const *name);
+ void map_output_tensors(FFModel &ff) override;
+ void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void forward(FFModel const &) override;
+ void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ Legion::FutureMap peft_bwd(FFModel const &,
+ BatchConfigFuture const &,
+                             std::vector<ParallelTensor> const &,
+                             std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void print_layer(FFModel const &model) override {
+ assert(0);
+ }
+ static Op *
+ create_operator_from_layer(FFModel &model,
+ Layer const *layer,
+                                 std::vector<Tensor> const &inputs);
+ void serialize(Legion::Serializer &) const override;
+ static PCG::Node deserialize(FFModel &ff,
+ Legion::Deserializer &d,
+ ParallelTensor inputs[],
+ int num_inputs);
+
+ AddBiasResidualLayerNormParams get_params() const;
+
+ static OpMeta *init_task(Legion::Task const *task,
+                           std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void inference_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void backward_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void peft_bwd_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ bool measure_operator_cost(Simulator *sim,
+ MachineView const &pc,
+ CostMetrics &cost_metrics) const override;
+  template <typename T>
+ static void inference_kernel(AddBiasResidualLayerNormMeta const *m,
+ int attn_bias_dim,
+ int residual_volume,
+ T const *input_ptr,
+ T const *attn_bias_ptr,
+ T const *residual_ptr,
+ T *added_output_ptr,
+ T *output_ptr,
+ T const *gamma_ptr,
+ T const *beta_ptr,
+ ffStream_t stream);
+ static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m,
+ BatchConfig const *bc,
+ GenericTensorAccessorR const &input,
+ GenericTensorAccessorR const &attn_bias,
+ GenericTensorAccessorR const &residual,
+ GenericTensorAccessorW &added_output,
+ GenericTensorAccessorW &output,
+ GenericTensorAccessorR const &gamma,
+ GenericTensorAccessorR const &beta);
+  template <typename T>
+ static void backward_kernel(AddBiasResidualLayerNormMeta const *m,
+ T const *output_grad_ptr,
+ T const *added_output_ptr,
+ T *input_grad_ptr,
+ T *residual_grad_ptr,
+ T *attn_bias_grad_ptr,
+ T const *gamma_ptr,
+ T *gamma_grad_ptr,
+ T *beta_grad_ptr,
+ ffStream_t stream);
+ static void
+ backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m,
+ GenericTensorAccessorR const &output_grad,
+ GenericTensorAccessorR &added_output,
+ GenericTensorAccessorW &input_grad,
+ GenericTensorAccessorW const &residual_grad,
+ GenericTensorAccessorW const &attn_bias_grad,
+ GenericTensorAccessorR const &gamma,
+ GenericTensorAccessorW const &gamma_grad,
+ GenericTensorAccessorW const &beta_grad);
+  template <typename T>
+ static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m,
+ T const *output_grad_ptr,
+ T *input_grad_ptr,
+ T *residual_grad_ptr,
+ T const *gamma_ptr,
+ ffStream_t stream);
+ static void
+ peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m,
+ GenericTensorAccessorR const &output_grad,
+ GenericTensorAccessorW &input_grad,
+ GenericTensorAccessorW const &residual_grad,
+ GenericTensorAccessorR const &gamma);
+
+public:
+ bool elementwise_affine, use_bias;
+ int64_t effective_batch_size, effective_num_elements;
+ float eps;
+ bool inplace_residual;
+  std::vector<int> axes;
+};
+
+class AddBiasResidualLayerNormMeta : public OpMeta {
+public:
+ AddBiasResidualLayerNormMeta(FFHandler handle,
+ AddBiasResidualLayerNorm const *ln,
+ MemoryAllocator &gpu_mem_allocator);
+ ~AddBiasResidualLayerNormMeta(void);
+
+public:
+ bool elementwise_affine, use_bias;
+ int64_t effective_batch_size, effective_num_elements;
+ float eps;
+ bool inplace_residual;
+ void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
+ Realm::RegionInstance reserveInst;
+ // PEFT related fields
+ void *input_activation;
+ size_t allocated_peft_buffer_size = 0;
+};
+
+}; // namespace FlexFlow
diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h
new file mode 100644
index 0000000000..840f521b01
--- /dev/null
+++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "flexflow/parallel_tensor.h"
+
+namespace FlexFlow {
+
+struct AddBiasResidualLayerNormParams {
+ LayerID layer_guid;
+  std::vector<int> axes;
+ bool elementwise_affine;
+ float eps;
+ bool use_bias;
+ bool inplace_residual;
+ char name[MAX_OPNAME];
+ bool is_valid(
+      std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
+};
+
+bool operator==(AddBiasResidualLayerNormParams const &,
+ AddBiasResidualLayerNormParams const &);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<FlexFlow::AddBiasResidualLayerNormParams> {
+ size_t operator()(FlexFlow::AddBiasResidualLayerNormParams const &) const;
+};
+} // namespace std
diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h
index 4eeb695e92..283e9a4290 100644
--- a/include/flexflow/ops/aggregate.h
+++ b/include/flexflow/ops/aggregate.h
@@ -1,6 +1,7 @@
#ifndef _FLEXFLOW_AGGREGATE_H_
#define _FLEXFLOW_AGGREGATE_H_
+#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/ops/aggregate_params.h"
@@ -8,11 +9,13 @@ namespace FlexFlow {
#define AGGREGATE_MAX_K 4
#define AGGREGATE_MAX_BATCH_SIZE 64
-#define AGGREGATE_MAX_N 12
+#define AGGREGATE_MAX_N 128
+
+class Aggregate;
class AggregateMeta : public OpMeta {
public:
- AggregateMeta(FFHandler handle, int n);
+ AggregateMeta(FFHandler handle, Aggregate const *aggr);
~AggregateMeta(void);
float **dev_exp_preds;
float **dev_exp_grads;
@@ -26,7 +29,7 @@ class Aggregate : public Op {
ParallelTensor const *inputs,
int _n,
float _lambda_bal,
- char const *name);
+ char const *name = nullptr);
Aggregate(FFModel &model,
Aggregate const &other,
std::vector const &inputs);
@@ -35,7 +38,16 @@ class Aggregate : public Op {
Input const &inputs,
char const *name = nullptr);
void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void forward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void backward(FFModel const &) override;
void print_layer(FFModel const &model) override {
assert(0);
@@ -81,6 +93,10 @@ class Aggregate : public Op {
int const batch_size,
int out_dim);
void serialize(Legion::Serializer &s) const override;
+ static PCG::Node deserialize(FFModel &ff,
+ Legion::Deserializer &d,
+ Input const &inputs,
+ int num_inputs);
bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const override;
diff --git a/include/flexflow/ops/aggregate_params.h b/include/flexflow/ops/aggregate_params.h
index f746881d89..deaa04b3e7 100644
--- a/include/flexflow/ops/aggregate_params.h
+++ b/include/flexflow/ops/aggregate_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct AggregateParams {
int n;
float lambda_bal;
+ char name[MAX_OPNAME];
bool is_valid(std::vector const &) const;
};
bool operator==(AggregateParams const &, AggregateParams const &);
diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h
index 8c1966e72a..a9f651b620 100644
--- a/include/flexflow/ops/aggregate_spec.h
+++ b/include/flexflow/ops/aggregate_spec.h
@@ -1,6 +1,7 @@
#ifndef _FLEXFLOW_AGGREGATE_SPEC_H_
#define _FLEXFLOW_AGGREGATE_SPEC_H_
+#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/ops/aggregate_spec_params.h"
@@ -10,9 +11,11 @@ namespace FlexFlow {
#define AGGREGATE_SPEC_MAX_BATCH_SIZE 32
#define AGGREGATE_SPEC_MAX_N 12
+class AggregateSpec;
+
class AggregateSpecMeta : public OpMeta {
public:
- AggregateSpecMeta(FFHandler handle, int n);
+ AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg);
~AggregateSpecMeta(void);
float **dev_region_ptrs;
};
@@ -27,7 +30,16 @@ class AggregateSpec : public Op {
float _lambda_bal,
char const *name);
void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void forward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void backward(FFModel const &) override;
void print_layer(FFModel const &model) override {
assert(0);
diff --git a/include/flexflow/ops/aggregate_spec_params.h b/include/flexflow/ops/aggregate_spec_params.h
index eb662f4c07..69e8574cba 100644
--- a/include/flexflow/ops/aggregate_spec_params.h
+++ b/include/flexflow/ops/aggregate_spec_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct AggregateSpecParams {
int n;
float lambda_bal;
+ char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(AggregateSpecParams const &, AggregateSpecParams const &);
diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
new file mode 100644
index 0000000000..3822a5e41e
--- /dev/null
+++ b/include/flexflow/ops/arg_topk.h
@@ -0,0 +1,110 @@
+#ifndef _FLEXFLOW_ARG_TOPK_H_
+#define _FLEXFLOW_ARG_TOPK_H_
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/node.h"
+#include "flexflow/ops/arg_topk_params.h"
+
+namespace FlexFlow {
+
+class ArgTopKMeta : public OpMeta {
+public:
+ ArgTopKMeta(FFHandler handle, Op const *op);
+ bool sorted;
+ int k;
+ bool speculative_decoding;
+};
+
+class ArgTopK : public Op {
+public:
+ using Params = ArgTopKParams;
+ using Input = ParallelTensor;
+ ArgTopK(FFModel &model,
+ LayerID const &layer_guid,
+ const ParallelTensor input,
+ int k,
+ bool sorted,
+ bool speculative_decoding,
+ char const *name);
+ ArgTopK(FFModel &model,
+ LayerID const &layer_guid,
+ ArgTopK const &other,
+ const ParallelTensor input);
+ ArgTopK(FFModel &model,
+ Params const &params,
+ Input const input,
+ char const *name = nullptr);
+ void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void forward(FFModel const &) override;
+ void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void print_layer(FFModel const &model) override {
+ assert(0);
+ }
+ static Op *
+ create_operator_from_layer(FFModel &model,
+ Layer const *layer,
+ std::vector<ParallelTensor> const &inputs);
+
+ static OpMeta *init_task(Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static InferenceResult
+ inference_task(Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static BeamInferenceResult inference_speculative_task(
+ Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ void serialize(Legion::Serializer &s) const override;
+ static PCG::Node deserialize(FFModel &ff,
+ Legion::Deserializer &d,
+ ParallelTensor inputs[],
+ int num_inputs);
+ Op *materialize(FFModel &ff,
+ ParallelTensor inputs[],
+ int num_inputs) const override;
+ bool measure_operator_cost(Simulator *sim,
+ MachineView const &pc,
+ CostMetrics &cost_metrics) const override;
+ template <typename DT>
+ static void forward_kernel(ArgTopKMeta const *m,
+ DT const *input_ptr,
+ float *output_ptr,
+ int *indices_ptr,
+ size_t batch_size,
+ int length,
+ int k,
+ bool sorted,
+ BeamSearchBatchConfig const *bc,
+ ffStream_t stream);
+ static void forward_kernel_wrapper(ArgTopKMeta const *m,
+ GenericTensorAccessorR const &input,
+ GenericTensorAccessorW const &prob,
+ GenericTensorAccessorW const &indices,
+ int batch_size,
+ BeamSearchBatchConfig const *bc);
+ Params get_params() const;
+
+public:
+ int k;
+ bool sorted;
+ bool speculative_decoding;
+};
+
+}; // namespace FlexFlow
+
+#endif
diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h
new file mode 100644
index 0000000000..b2876c011f
--- /dev/null
+++ b/include/flexflow/ops/arg_topk_params.h
@@ -0,0 +1,29 @@
+#ifndef _FLEXFLOW_ARG_TOPK_PARAMS_H
+#define _FLEXFLOW_ARG_TOPK_PARAMS_H
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "flexflow/parallel_tensor.h"
+
+namespace FlexFlow {
+
+struct ArgTopKParams {
+ LayerID layer_guid;
+ int k;
+ bool sorted;
+ bool speculative_decoding;
+ char name[MAX_OPNAME];
+ bool is_valid(ParallelTensorShape const &) const;
+};
+bool operator==(ArgTopKParams const &, ArgTopKParams const &);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<FlexFlow::ArgTopKParams> {
+ size_t operator()(FlexFlow::ArgTopKParams const &) const;
+};
+} // namespace std
+
+#endif // _FLEXFLOW_ARG_TOPK_PARAMS_H
diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h
new file mode 100644
index 0000000000..eca9943d20
--- /dev/null
+++ b/include/flexflow/ops/argmax.h
@@ -0,0 +1,117 @@
+#ifndef _FLEXFLOW_ARG_MAX_H_
+#define _FLEXFLOW_ARG_MAX_H_
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/node.h"
+#include "flexflow/ops/argmax_params.h"
+#include "flexflow/utils/memory_allocator.h"
+
+namespace FlexFlow {
+
+class ArgMaxMeta : public OpMeta {
+public:
+ bool beam_search;
+ float *probs;
+ void *d_temp_storage;
+ size_t temp_storage_bytes = 0;
+ int *d_offsets;
+ void *d_out;
+ float *d_loss;
+ Realm::RegionInstance reserveInst;
+ ArgMaxMeta(FFHandler handler,
+ Op const *op,
+ Legion::Domain const &input_domain,
+ Legion::Domain const &output_domain,
+ GenericTensorAccessorW input,
+ int batch_size,
+ int total_ele,
+ MemoryAllocator &gpu_mem_allocator);
+ ~ArgMaxMeta(void);
+};
+
+class ArgMax : public Op {
+public:
+ using Params = ArgMaxParams;
+ using Input = ParallelTensor;
+ ArgMax(FFModel &model,
+ const ParallelTensor input,
+ bool beam_search,
+ char const *name);
+ ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input);
+ ArgMax(FFModel &model,
+ Params const &params,
+ Input const input,
+ char const *name = nullptr);
+ void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void forward(FFModel const &) override;
+ void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void print_layer(FFModel const &model) override {
+ assert(0);
+ }
+ static Op *
+ create_operator_from_layer(FFModel &model,
+ Layer const *layer,
+ std::vector<ParallelTensor> const &inputs);
+
+ static OpMeta *init_task(Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static BeamInferenceResult
+ inference_task_beam(Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static InferenceResult
+ inference_task_norm(Legion::Task const *task,
+ std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ void serialize(Legion::Serializer &s) const override;
+ static PCG::Node deserialize(FFModel &ff,
+ Legion::Deserializer &d,
+ ParallelTensor inputs[],
+ int num_inputs);
+ Op *materialize(FFModel &ff,
+ ParallelTensor inputs[],
+ int num_inputs) const override;
+ bool measure_operator_cost(Simulator *sim,
+ MachineView const &pc,
+ CostMetrics &cost_metrics) const override;
+ template <typename DT>
+ static void forward_kernel(ArgMaxMeta const *m,
+ BatchConfig const *bc,
+ DT const *input_ptr,
+ int *indices_ptr,
+ float *prob_ptr,
+ int *parent_ptr,
+ int length,
+ int batch_size,
+ float *loss,
+ ffStream_t stream);
+ static void forward_kernel_wrapper(ArgMaxMeta const *m,
+ BatchConfig const *bc,
+ GenericTensorAccessorR const &input,
+ GenericTensorAccessorW const &indices,
+ GenericTensorAccessorW const &parent,
+ int batch_size,
+ float *loss);
+ Params get_params() const;
+
+public:
+ bool beam_search;
+};
+
+}; // namespace FlexFlow
+
+#endif
\ No newline at end of file
diff --git a/include/flexflow/ops/argmax_params.h b/include/flexflow/ops/argmax_params.h
new file mode 100644
index 0000000000..9ddb8e1fe3
--- /dev/null
+++ b/include/flexflow/ops/argmax_params.h
@@ -0,0 +1,25 @@
+#ifndef _FLEXFLOW_ARGMAX_PARAMS_H
+#define _FLEXFLOW_ARGMAX_PARAMS_H
+
+#include "flexflow/ffconst.h"
+#include "flexflow/parallel_tensor.h"
+
+namespace FlexFlow {
+
+struct ArgMaxParams {
+ bool beam_search;
+ bool is_valid(ParallelTensorShape const &) const;
+ char name[MAX_OPNAME];
+};
+bool operator==(ArgMaxParams const &, ArgMaxParams const &);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<FlexFlow::ArgMaxParams> {
+ size_t operator()(FlexFlow::ArgMaxParams const &) const;
+};
+} // namespace std
+
+#endif // _FLEXFLOW_ARGMAX_PARAMS_H
\ No newline at end of file
diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h
index 2903497af9..7f52e0dad4 100644
--- a/include/flexflow/ops/attention.h
+++ b/include/flexflow/ops/attention.h
@@ -3,6 +3,7 @@
#include "flexflow/device.h"
#include "flexflow/fftype.h"
+#include "flexflow/inference.h"
#include "flexflow/layer.h"
#include "flexflow/node.h"
#include "flexflow/op_meta.h"
@@ -64,8 +65,17 @@ class MultiHeadAttention : public Op {
Layer const *layer,
std::vector const &inputs);
void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void forward(FFModel const &) override;
void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
void print_layer(FFModel const &model) override {
assert(0);
}
diff --git a/include/flexflow/ops/attention_params.h b/include/flexflow/ops/attention_params.h
index b72923a65c..89906407d3 100644
--- a/include/flexflow/ops/attention_params.h
+++ b/include/flexflow/ops/attention_params.h
@@ -11,6 +11,7 @@ struct MultiHeadAttentionParams {
int embed_dim, num_heads, kdim, vdim;
float dropout;
bool bias, add_bias_kv, add_zero_attn;
+ char name[MAX_OPNAME];
bool is_valid(std::tuple const &) const;
};
diff --git a/include/flexflow/ops/batch_norm.h b/include/flexflow/ops/batch_norm.h
index c923dc1097..01cc0e16ec 100644
--- a/include/flexflow/ops/batch_norm.h
+++ b/include/flexflow/ops/batch_norm.h
@@ -2,6 +2,7 @@
#define _FLEXFLOW_BATCH_NORM_H
#include "flexflow/model.h"
+#include "flexflow/utils/memory_allocator.h"
namespace FlexFlow {
diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h
new file mode 100644
index 0000000000..9466ba2a3b
--- /dev/null
+++ b/include/flexflow/ops/beam_topk.h
@@ -0,0 +1,112 @@
+#ifndef _FLEXFLOW_BEAM_TOPK_H_
+#define _FLEXFLOW_BEAM_TOPK_H_
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/node.h"
+#include "flexflow/ops/beam_topk_params.h"
+#include "flexflow/utils/memory_allocator.h"
+
+namespace FlexFlow {
+
+class BeamTopKMeta : public OpMeta {
+public:
+ BeamTopKMeta(FFHandler handle,
+ Op const *op,
+ MemoryAllocator &gpu_mem_allocator);
+ ~BeamTopKMeta(void);
+ bool sorted;
+ int max_beam_width;
+ int *parent_ids;
+ void *acc_probs;
+ int *block_start_index;
+ int *request_id;
+ int *tokens_per_request;
+ Realm::RegionInstance reserveInst;
+};
+
+class BeamTopK : public Op {
+public:
+ using Params = BeamTopKParams;
+ using Input = ParallelTensor;
+ BeamTopK(FFModel &model,
+ const ParallelTensor input,
+ LayerID const &_layer_guid,
+ int max_beam_width,
+ bool sorted,
+ char const *name);
+ BeamTopK(FFModel &model, BeamTopK const &other, const ParallelTensor input);
+ BeamTopK(FFModel &model,
+ Params const &params,
+ Input const input,
+ char const *name = nullptr);
+ void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+ std::vector<ParallelTensor> const &,
+ std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void forward(FFModel const &) override;
+ void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+ std::vector<ParallelTensor> const &,
+ std::vector