From 345aeb95ed59ed05ca92e917109329073488e638 Mon Sep 17 00:00:00 2001
From: DerrickYLJ
Date: Sun, 10 Sep 2023 14:06:53 -0400
Subject: [PATCH] added runner

Start the Oracle VM "flexflow-ci" before the prebuild-legion job and stop
it afterwards, and pin the GPU workflows to runners labelled `gpu`.

- build_legion.yml: add start-oracle-instance and stop-oracle-instance
  jobs, both driven by docker/oracle_con.py.
- gpu-ci.yml, multinode-test.yml: run on [self-hosted, gpu] instead of
  any self-hosted runner.
- docker/oracle_con.py: new script that sends START or STOP to an OCI
  compute instance, authenticating from the OCI_CLI_* variables.
---
Notes for the reviewer (text here is not part of the commit message):
- Context-line indentation was reconstructed from the workflow structure;
  verify it against the target tree before applying.
- `index` lines are omitted for build_legion.yml and oracle_con.py because
  their post-image content changed and the new blob ids are not known.

 .github/workflows/build_legion.yml   | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 .github/workflows/gpu-ci.yml         |  6 +++---
 .github/workflows/multinode-test.yml |  6 +++---
 docker/oracle_con.py                 | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 6 deletions(-)
 create mode 100644 docker/oracle_con.py

diff --git a/.github/workflows/build_legion.yml b/.github/workflows/build_legion.yml
--- a/.github/workflows/build_legion.yml
+++ b/.github/workflows/build_legion.yml
@@ -14,6 +14,28 @@ concurrency:
   cancel-in-progress: true

 jobs:
+  start-oracle-instance:
+    name: Turning on Oracle VM - flexflow-ci
+    runs-on: ubuntu-latest
+    env:
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+
+      - name: Install Oci
+        run: pip install oci
+
+      - name: Running script (START)
+        run: python3 docker/oracle_con.py --start --instance_id "$OCI_INSTANCE_ID"
+
   prebuild-legion:
     name: Prebuild Legion with CMake
+    # Wait for the job that starts the Oracle VM.
+    needs: start-oracle-instance
     runs-on: [self-hosted, cpu_only]
@@ -104,3 +126,27 @@ jobs:
           TAG_NAME: ${{ env.RELEASE_DATETIME }}
           GITHUB_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }}
         run: gh release create $TAG_NAME ./unwrapped_artifacts/*.tar.gz --repo flexflow/flexflow-third-party
+
+  stop-oracle-instance:
+    name: Turning off Oracle VM - flexflow-ci
+    runs-on: ubuntu-latest
+    # Run after the build, and even when it failed or was cancelled, so the
+    # VM is not stopped mid-build and is not left running afterwards.
+    needs: prebuild-legion
+    if: always()
+    env:
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+
+      - name: Install Oci
+        run: pip install oci
+
+      - name: Running script (STOP)
+        run: python3 docker/oracle_con.py --stop --instance_id "$OCI_INSTANCE_ID"
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index d604a7cea9..aee16832f3 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -61,7 +61,7 @@ jobs:

   python-interface-check:
     name: Check Python Interface
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     defaults:
       run:
         shell: bash -l {0} # required to use an activated conda environment
@@ -135,7 +135,7 @@ jobs:

   inference-tests:
     name: Inference Tests
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     defaults:
       run:
         shell: bash -l {0} # required to use an activated conda environment
@@ -210,7 +210,7 @@ jobs:

   gpu-ci-flexflow:
     name: Single Machine, Multiple GPUs Tests
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     # skip this time-consuming test for PRs to the inference branch
     # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
     defaults:
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml
index 37f81b615f..bd13adae75 100644
--- a/.github/workflows/multinode-test.yml
+++ b/.github/workflows/multinode-test.yml
@@ -32,7 +32,7 @@ jobs:
     name: Multinode GPU Test with MPI
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     # 10h timeout, instead of default of 360min (6h)
     timeout-minutes: 600
@@ -83,7 +83,7 @@ jobs:
     name: Multinode GPU Test with UCX
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
@@ -134,7 +134,7 @@ jobs:
     name: Multinode GPU Test with native UCX
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
diff --git a/docker/oracle_con.py b/docker/oracle_con.py
new file mode 100644
--- /dev/null
+++ b/docker/oracle_con.py
@@ -0,0 +1,50 @@
+"""Start or stop an Oracle Cloud Infrastructure (OCI) compute instance.
+
+Credentials are read from the OCI_CLI_* environment variables and the
+target instance is given with --instance_id. Exactly one of --start or
+--stop must be passed.
+"""
+import argparse
+import os
+
+import oci
+
+
+def parse_args():
+    """Parse the command line: one action flag plus the instance id."""
+    parser = argparse.ArgumentParser(description="Program with optional flags")
+    # Require an explicit action: without required=True a call with no flag
+    # would silently fall through to STOP.
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--start", action="store_true", help="Start action")
+    group.add_argument("--stop", action="store_true", help="Stop action")
+    parser.add_argument(
+        "--instance_id", type=str, required=True, help="instance id required"
+    )
+    return parser.parse_args()
+
+
+def build_config():
+    """Return a validated OCI SDK config built from OCI_CLI_* variables."""
+    config = {
+        "user": os.getenv("OCI_CLI_USER"),
+        "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
+        "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
+        "tenancy": os.getenv("OCI_CLI_TENANCY"),
+        "region": os.getenv("OCI_CLI_REGION"),
+    }
+    oci.config.validate_config(config)
+    return config
+
+
+def main():
+    """Send the requested START or STOP action to the instance."""
+    args = parse_args()
+    # ComputeClient is the SDK client used to act on VM instances.
+    compute = oci.core.ComputeClient(build_config())
+    action = "START" if args.start else "STOP"
+    compute.instance_action(args.instance_id, action)
+
+
+if __name__ == "__main__":
+    main()