[AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) #3222

Workflow file for this run

	name: "gpu-ci"
	on:
	schedule:
	- cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28.
	push:
	branches:
	- "inference"
	paths:
	- "cmake/**"
	- "config/**"
	- "deps/**"
	- "python/**"
	- "setup.py"
	- "include/**"
	- "inference/**"
	- "src/**"
	- "tests/inference/**"
	- "conda/flexflow.yml"
	- ".github/workflows/gpu-ci.yml"
	- "tests/cpp_gpu_tests.sh"
	- "tests/inference_tests.sh"
	- "tests/training_tests.sh"
	- "tests/python_interface_test.sh"
	workflow_dispatch:

	concurrency:
	group: gpu-ci-${{ github.head_ref \|\| github.run_id }}
	cancel-in-progress: true

	jobs:
	gpu-ci-concierge:
	name: GPU CI Concierge
	runs-on: ubuntu-20.04
	env:
	FLEXFLOW_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	steps:
	- name: Checkout Git Repository
	uses: actions/checkout@v3

	- name: Wait for daemon to be done
	run: \|
	pip3 install pip --upgrade
	pip3 install pyopenssl --upgrade
	pip3 install urllib3 --upgrade
	pip3 install pygithub
	python3 .github/workflows/helpers/gpu_ci_helper.py

	keep-runner-registered:
	name: Keep runner alive
	if: ${{ github.event_name == 'schedule' }}
	runs-on: [self-hosted, gpu]
	defaults:
	run:
	shell: bash -l {0} # required to use an activated conda environment
	env:
	CONDA: "3"
	needs: gpu-ci-concierge
	container:
	image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
	options: --gpus all --shm-size=8192m
	steps:
	- name: Keep alive
	run: \|
	echo "Keep self-hosted runner registered with Github"
	sleep 10m

	python-interface-check:
	name: Check Python Interface
	if: ${{ github.event_name != 'schedule' }}
	runs-on: [self-hosted, gpu]
	defaults:
	run:
	shell: bash -l {0} # required to use an activated conda environment
	env:
	CONDA: "3"
	needs: gpu-ci-concierge
	container:
	image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
	options: --gpus all --shm-size=8192m
	steps:
	- name: Install updated git version
	run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

	- name: Checkout Git Repository
	uses: actions/checkout@v3
	with:
	submodules: recursive

	- name: Install conda and FlexFlow dependencies
	uses: conda-incubator/setup-miniconda@v2
	with:
	miniconda-version: "latest"
	activate-environment: flexflow
	environment-file: conda/flexflow.yml
	auto-activate-base: false
	auto-update-conda: false

	- name: Install conda and Pytorch dependencies for pytorch alignment test
	run: \|
	conda env create -f conda/pytorch-gpu.yml

	- name: Build FlexFlow
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
	mkdir build
	cd build
	../config/config.linux
	make -j

	- name: Check FlexFlow Python interface (before installation)
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
	./tests/python_interface_test.sh before-installation

	- name: Install FlexFlow
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
	cd build
	../config/config.linux
	make install
	ldconfig

	- name: Check FlexFlow Python interface (after installation)
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
	./tests/python_interface_test.sh after-installation

	- name: Run flexflow alignment with pytorch
	run: \|
	# run alingment tests
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
	./tests/align/test_all_operators.sh

	inference-tests:
	name: Inference Tests
	if: ${{ github.event_name != 'schedule' }}
	runs-on: [self-hosted, gpu]
	defaults:
	run:
	shell: bash -l {0} # required to use an activated conda environment
	env:
	CONDA: "3"
	HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
	needs: gpu-ci-concierge
	container:
	image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
	options: --gpus all --shm-size=8192m
	steps:
	- name: Install updated git version
	run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

	- name: Checkout Git Repository
	uses: actions/checkout@v3
	with:
	submodules: recursive

	- name: Install conda and FlexFlow dependencies
	uses: conda-incubator/setup-miniconda@v2
	with:
	miniconda-version: "latest"
	activate-environment: flexflow
	environment-file: conda/flexflow.yml
	auto-activate-base: false

	- name: Build FlexFlow
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
	export FF_BUILD_INFERENCE=ON
	mkdir build
	cd build
	../config/config.linux
	make -j

	- name: Run PEFT tests
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export CUDNN_DIR=/usr/local/cuda
	export CUDA_DIR=/usr/local/cuda
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib

	source ./build/set_python_envs.sh
	./tests/peft_test.sh

	- name: Run inference tests
	env:
	CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export CUDNN_DIR=/usr/local/cuda
	export CUDA_DIR=/usr/local/cuda
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib

	# GPT tokenizer test
	# ./tests/gpt_tokenizer_test.sh

	# Inference tests
	source ./build/set_python_envs.sh
	./tests/inference_tests.sh

	- name: Save inference output as an artifact
	if: always()
	run: \|
	cd inference
	tar -zcvf output.tar.gz ./output

	- name: Upload artifact
	uses: actions/upload-artifact@v3
	if: always()
	with:
	name: output
	path: inference/output.tar.gz

	# Github persists the .cache folder across different runs/containers
	- name: Clear cache
	if: always()
	run: sudo rm -rf ~/.cache

	training-tests:
	name: Training Tests
	if: ${{ github.event_name != 'schedule' }}
	runs-on: [self-hosted, gpu]
	# skip this time-consuming test for PRs to the inference branch
	# if: ${{ github.event_name != 'pull_request' \|\| github.base_ref != 'inference' }}
	defaults:
	run:
	shell: bash -l {0} # required to use an activated conda environment
	env:
	CONDA: "3"
	needs: inference-tests
	container:
	image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
	options: --gpus all --shm-size=8192m
	steps:
	- name: Install updated git version
	run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

	- name: Checkout Git Repository
	uses: actions/checkout@v3
	with:
	submodules: recursive

	- name: Install conda and FlexFlow dependencies
	uses: conda-incubator/setup-miniconda@v2
	with:
	miniconda-version: "latest"
	activate-environment: flexflow
	environment-file: conda/flexflow.yml
	auto-activate-base: false

	- name: Build and Install FlexFlow
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export FF_BUILD_TRAINING_EXAMPLES=ON
	export FF_BUILD_INFERENCE=ON
	export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
	pip install . --verbose

	- name: Check FlexFlow Python interface (pip)
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export FF_HOME=$(pwd)
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
	./tests/python_interface_test.sh after-installation

	- name: Run multi-gpu tests
	run: \|
	export PATH=$CONDA_PREFIX/bin:$PATH
	export CUDNN_DIR=/usr/local/cuda
	export CUDA_DIR=/usr/local/cuda
	export FF_HOME=$(pwd)
	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
	# C++ tests
	./tests/cpp_gpu_tests.sh 4
	# Python tests
	./tests/training_tests.sh 4

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) #3222

Workflow file

[AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) #3222

Jobs

Run details

Workflow file for this run