
Merge branch 'inference' into new_falcon
goliaro authored Apr 6, 2024
2 parents 8c913a5 + 1210256 commit 51de69e
Showing 152 changed files with 3,124 additions and 384 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-build-skip.yml
@@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
12 changes: 6 additions & 6 deletions .github/workflows/docker-build.yml
@@ -103,27 +103,27 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
- build_needed: ${{ matrix.cuda_version == '11.8' }}
+ build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
@@ -137,7 +137,7 @@ jobs:
fi
- name: Check availability of flexflow modules in Python
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
33 changes: 8 additions & 25 deletions .github/workflows/gpu-ci.yml
@@ -1,25 +1,8 @@
name: "gpu-ci"
on:
- pull_request:
- paths:
- - "cmake/**"
- - "config/**"
- - "deps/**"
- - "python/**"
- - "setup.py"
- - "include/**"
- - "inference/**"
- - "src/**"
- - "tests/inference/**"
- - "conda/flexflow.yml"
- - ".github/workflows/gpu-ci.yml"
- - "tests/cpp_gpu_tests.sh"
- - "tests/inference_tests.sh"
- - "tests/training_tests.sh"
- - "tests/python_interface_test.sh"
push:
branches:
- "master"
- "inference"
paths:
- "cmake/**"
- "config/**"
@@ -194,7 +177,7 @@ jobs:
- name: Save inference output as an artifact
if: always()
run: |
cd inference
tar -zcvf output.tar.gz ./output
@@ -222,7 +205,7 @@ jobs:
CONDA: "3"
needs: inference-tests
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -243,7 +226,7 @@

- name: Build and Install FlexFlow
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
@@ -252,18 +235,18 @@
- name: Check FlexFlow Python interface (pip)
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run multi-gpu tests
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# C++ tests
./tests/cpp_gpu_tests.sh 4
# Python tests
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY)

# python related
if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
# create flexflow_cffi_header.py
add_custom_command(TARGET flexflow
PRE_BUILD
@@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY)
# generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
add_custom_command(TARGET flexflow
POST_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+ COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
)
# create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
add_custom_command(TARGET flexflow
PRE_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Creating flexflow_python interpreter..."
)
@@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY)
install(TARGETS flexflow DESTINATION ${LIB_DEST})
# install python
if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ find_package(Python COMPONENTS Interpreter Development)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT FF_BUILD_FROM_PYPI)
install(
DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
2 changes: 1 addition & 1 deletion README.md
@@ -35,7 +35,7 @@ If you run into any issue during the install, or if you would like to use the C+
docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
```

- To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
+ To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.0`, `cuda-12.1`, `cuda-12.2`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).

### Build from source

72 changes: 69 additions & 3 deletions SERVE.md
@@ -182,14 +182,80 @@ FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are s
### Prompt Datasets
We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
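Each dataset appears to be a plain JSON list of prompt strings, so it can be fed directly to a compiled FlexFlow Serve LLM. Below is a minimal sketch, assuming an `llm` that has already been compiled and started, and assuming the JSON-array-of-strings layout (verify against the actual files):

```python
import json
import urllib.request

# Download one of the prompt datasets (assumed to be a JSON array of prompt strings)
url = "https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json"
with urllib.request.urlopen(url) as f:
    prompts = json.load(f)

# Generate responses with an already-compiled and started FlexFlow `llm`;
# generate() accepts a list of prompts and returns one result per prompt
results = llm.generate(prompts)
```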




## Python Interface Features and Interaction Methods

FlexFlow Serve provides a comprehensive Python interface for serving LLMs with low latency and high performance. This interface facilitates deployment of, and interaction with, the serving platform for a variety of applications, from chatbots and prompt templates to retrieval-augmented generation and API services.

### Chatbot with Gradio

The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface.

#### Implementation Steps
1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM.
```python
import gradio as gr
import flexflow.serve as ff

ff.init(num_gpus=2, memory_per_gpu=14000, ...)
```
2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction.
```python
# gr.ChatInterface calls fn with both the latest message and the chat history
def generate_response(user_input, history):
    result = llm.generate(user_input)
    return result.output_text.decode('utf-8')
```
3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface.
```python
iface = gr.ChatInterface(fn=generate_response)
iface.launch()
```
4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete; a complete end-to-end sketch of all four steps follows below.
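Putting the four steps together, a minimal end-to-end sketch might look like the following. The model name, GPU settings, and the `stop_server()` call are illustrative assumptions, not prescriptions:

```python
import gradio as gr
import flexflow.serve as ff

# Step 1: initialize FlexFlow Serve (GPU count and memory sizes are example values)
ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000)

# Load and compile an LLM (the model name is an illustrative assumption)
llm = ff.LLM("meta-llama/Llama-2-7b-hf")
llm.compile(ff.GenerationConfig())
llm.start_server()

# Step 2: response-generation function used by the Gradio chat interface
def generate_response(user_input, history):
    result = llm.generate(user_input)
    return result.output_text.decode('utf-8')

# Step 3: launch the web interface (blocks until the interface is closed)
iface = gr.ChatInterface(fn=generate_response)
iface.launch()

# Step 4: stop the FlexFlow server once interaction is complete
llm.stop_server()
```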



### LangChain Use Cases
FlexFlow Serve supports LangChain use cases, including dynamic prompt-template handling and retrieval-augmented generation (RAG), enabling customization of model responses based on structured input templates and retrieved context.

#### Implementation Steps
1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations.
2. **LLM Setup**: Compile and load the LLM for text generation.
3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses.
4. **Response Generation**: Use the LLM with the prompt template to generate responses, as in the sketch below.
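Here is a minimal sketch of the prompt-template flow, assuming an `llm` that has already been compiled and started as above. The template text and variable names are illustrative; for RAG, the `context` value would come from a retriever rather than being hard-coded:

```python
from langchain.prompts import PromptTemplate

# Illustrative template; the variable names are assumptions
template = (
    "Answer the question using only the given context.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# For RAG, replace the hard-coded context with retrieved documents
filled = prompt.format(
    context="FlexFlow Serve is a low-latency, high-performance LLM serving framework.",
    question="What is FlexFlow Serve?",
)
result = llm.generate(filled)
print(result.output_text.decode('utf-8'))
```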


### Python FastAPI Entrypoint
FlexFlow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API interface for generating responses from models.

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Request schema for the /generate/ endpoint (illustrative; extend as needed)
class PromptRequest(BaseModel):
    prompt: str

@app.on_event("startup")
async def startup_event():
    global llm
    # Initialize and compile the LLM model (the ff.LLM object is assumed
    # to have been created at module level)
    llm.compile(
        generation_config,
        # ... other params as needed
    )
    llm.start_server()

@app.post("/generate/")
async def generate(prompt_request: PromptRequest):
    # ... exception handling
    full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
    # ... split prompt and response text for returning results
    return {"prompt": prompt_request.prompt, "response": full_output}
```
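Once the application is running (for example via `uvicorn`), the endpoint can be exercised with a short client. The host, port, and module name below are assumptions:

```python
import requests

# Assumes the FastAPI app is served at localhost:8000, e.g.:
#   uvicorn entrypoint:app --host 0.0.0.0 --port 8000
resp = requests.post(
    "http://localhost:8000/generate/",
    json={"prompt": "What is machine learning?"},
)
print(resp.json()["response"])
```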




## TODOs

FlexFlow Serve is still under active development. We are currently focusing on the following tasks, and we strongly welcome all contributions, from bug fixes to new features and extensions.

* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
* Chatbot prompt templates and Multi-round conversations
* Support for FastAPI server
* Integration with LangChain for document question answering

## Acknowledgements
This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
Expand Down
4 changes: 2 additions & 2 deletions cmake/pip_install/CMakeLists.txt
@@ -1,10 +1,10 @@
# Use setup.py script to re-install the Python bindings library with the right library paths
if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if(FF_BUILD_FROM_PYPI)
install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
# CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
# Legion_BINARY_DIR=/usr/FlexFlow/build/<something>/deps/legion
install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
endif()
endif()
2 changes: 1 addition & 1 deletion deps/legion
Submodule legion updated from 626b55 to 24e8c4
