
Merge branch 'inference' into new_falcon
goliaro authored Apr 6, 2024
2 parents 8c913a5 + 1210256 commit 51de69e
Showing 152 changed files with 3,124 additions and 384 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-build-skip.yml
@@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
12 changes: 6 additions & 6 deletions .github/workflows/docker-build.yml
@@ -103,27 +103,27 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
- build_needed: ${{ matrix.cuda_version == '11.8' }}
+ build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
@@ -137,7 +137,7 @@ jobs:
fi
- name: Check availability of flexflow modules in Python
- if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
33 changes: 8 additions & 25 deletions .github/workflows/gpu-ci.yml
@@ -1,25 +1,8 @@
name: "gpu-ci"
on:
- pull_request:
- paths:
- - "cmake/**"
- - "config/**"
- - "deps/**"
- - "python/**"
- - "setup.py"
- - "include/**"
- - "inference/**"
- - "src/**"
- - "tests/inference/**"
- - "conda/flexflow.yml"
- - ".github/workflows/gpu-ci.yml"
- - "tests/cpp_gpu_tests.sh"
- - "tests/inference_tests.sh"
- - "tests/training_tests.sh"
- - "tests/python_interface_test.sh"
push:
branches:
- "master"
- "inference"
paths:
- "cmake/**"
- "config/**"
@@ -194,7 +177,7 @@ jobs:
- name: Save inference output as an artifact
if: always()
run: |
cd inference
tar -zcvf output.tar.gz ./output
@@ -222,7 +205,7 @@ jobs:
CONDA: "3"
needs: inference-tests
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -243,7 +226,7 @@

- name: Build and Install FlexFlow
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
@@ -252,18 +235,18 @@
- name: Check FlexFlow Python interface (pip)
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run multi-gpu tests
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# C++ tests
./tests/cpp_gpu_tests.sh 4
# Python tests
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY)

# python related
if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
# create flexflow_cffi_header.py
add_custom_command(TARGET flexflow
PRE_BUILD
@@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY)
# generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
add_custom_command(TARGET flexflow
POST_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+ COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
)
# create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
add_custom_command(TARGET flexflow
PRE_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Creating flexflow_python interpreter..."
)
@@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY)
install(TARGETS flexflow DESTINATION ${LIB_DEST})
# install python
if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ find_package(Python COMPONENTS Interpreter Development)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT FF_BUILD_FROM_PYPI)
install(
DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
2 changes: 1 addition & 1 deletion README.md
@@ -35,7 +35,7 @@ If you run into any issue during the install, or if you would like to use the C+
docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
```

- To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
+ To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.0`, `cuda-12.1`, `cuda-12.2`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).

### Build from source

72 changes: 69 additions & 3 deletions SERVE.md
@@ -182,14 +182,80 @@ FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are s
### Prompt Datasets
We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
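Each dataset appears to be a plain JSON list of prompt strings, so it can be fed directly to a compiled FlexFlow Serve LLM. Below is a minimal sketch, assuming an `llm` that has already been compiled and started, and assuming the JSON-array-of-strings layout (verify against the actual files):

```python
import json
import urllib.request

# Download one of the prompt datasets (assumed to be a JSON array of prompt strings)
url = "https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json"
with urllib.request.urlopen(url) as f:
    prompts = json.load(f)

# Generate responses with an already-compiled and started FlexFlow `llm`;
# generate() accepts a list of prompts and returns one result per prompt
results = llm.generate(prompts)
```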




## Python Interface Features and Interaction Methods

FlexFlow Serve provides a comprehensive Python interface for serving LLMs with low latency and high performance. This interface facilitates deployment of, and interaction with, the serving platform for a variety of applications, from chatbots and prompt templates to retrieval-augmented generation and API services.

### Chatbot with Gradio

The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface.

#### Implementation Steps
1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM.
```python
import gradio as gr
import flexflow.serve as ff

ff.init(num_gpus=2, memory_per_gpu=14000, ...)
```
2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction.
```python
# gr.ChatInterface calls fn with both the latest message and the chat history
def generate_response(user_input, history):
    result = llm.generate(user_input)
    return result.output_text.decode('utf-8')
```
3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface.
```python
iface = gr.ChatInterface(fn=generate_response)
iface.launch()
```
4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete; a complete end-to-end sketch of all four steps follows below.
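Putting the four steps together, a minimal end-to-end sketch might look like the following. The model name, GPU settings, and the `stop_server()` call are illustrative assumptions, not prescriptions:

```python
import gradio as gr
import flexflow.serve as ff

# Step 1: initialize FlexFlow Serve (GPU count and memory sizes are example values)
ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000)

# Load and compile an LLM (the model name is an illustrative assumption)
llm = ff.LLM("meta-llama/Llama-2-7b-hf")
llm.compile(ff.GenerationConfig())
llm.start_server()

# Step 2: response-generation function used by the Gradio chat interface
def generate_response(user_input, history):
    result = llm.generate(user_input)
    return result.output_text.decode('utf-8')

# Step 3: launch the web interface (blocks until the interface is closed)
iface = gr.ChatInterface(fn=generate_response)
iface.launch()

# Step 4: stop the FlexFlow server once interaction is complete
llm.stop_server()
```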



### LangChain Use Cases
FlexFlow Serve supports LangChain use cases, including dynamic prompt-template handling and retrieval-augmented generation (RAG), enabling customization of model responses based on structured input templates and retrieved context.

#### Implementation Steps
1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations.
2. **LLM Setup**: Compile and load the LLM for text generation.
3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses.
4. **Response Generation**: Use the LLM with the prompt template to generate responses, as in the sketch below.
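Here is a minimal sketch of the prompt-template flow, assuming an `llm` that has already been compiled and started as above. The template text and variable names are illustrative; for RAG, the `context` value would come from a retriever rather than being hard-coded:

```python
from langchain.prompts import PromptTemplate

# Illustrative template; the variable names are assumptions
template = (
    "Answer the question using only the given context.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# For RAG, replace the hard-coded context with retrieved documents
filled = prompt.format(
    context="FlexFlow Serve is a low-latency, high-performance LLM serving framework.",
    question="What is FlexFlow Serve?",
)
result = llm.generate(filled)
print(result.output_text.decode('utf-8'))
```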


### Python FastAPI Entrypoint
FlexFlow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API interface for generating responses from models.

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Request schema for the /generate/ endpoint (illustrative; extend as needed)
class PromptRequest(BaseModel):
    prompt: str

@app.on_event("startup")
async def startup_event():
    global llm
    # Initialize and compile the LLM model (the ff.LLM object is assumed
    # to have been created at module level)
    llm.compile(
        generation_config,
        # ... other params as needed
    )
    llm.start_server()

@app.post("/generate/")
async def generate(prompt_request: PromptRequest):
    # ... exception handling
    full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
    # ... split prompt and response text for returning results
    return {"prompt": prompt_request.prompt, "response": full_output}
```
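Once the application is running (for example via `uvicorn`), the endpoint can be exercised with a short client. The host, port, and module name below are assumptions:

```python
import requests

# Assumes the FastAPI app is served at localhost:8000, e.g.:
#   uvicorn entrypoint:app --host 0.0.0.0 --port 8000
resp = requests.post(
    "http://localhost:8000/generate/",
    json={"prompt": "What is machine learning?"},
)
print(resp.json()["response"])
```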




## TODOs

FlexFlow Serve is still under active development. We are currently focusing on the following tasks, and we strongly welcome all contributions, from bug fixes to new features and extensions.

* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
* Chatbot prompt templates and Multi-round conversations
* Support for FastAPI server
* Integration with LangChain for document question answering

## Acknowledgements
This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
Expand Down
4 changes: 2 additions & 2 deletions cmake/pip_install/CMakeLists.txt
@@ -1,10 +1,10 @@
# Use setup.py script to re-install the Python bindings library with the right library paths
if (FF_USE_PYTHON)
- execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if(FF_BUILD_FROM_PYPI)
install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
# CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
# Legion_BINARY_DIR=/usr/FlexFlow/build/<something>/deps/legion
install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
endif()
endif()
2 changes: 1 addition & 1 deletion deps/legion
Submodule legion updated from 626b55 to 24e8c4
