From 345aeb95ed59ed05ca92e917109329073488e638 Mon Sep 17 00:00:00 2001
From: DerrickYLJ <lijiey@andrew.cmu.edu>
Date: Sun, 10 Sep 2023 14:06:53 -0400
Subject: [PATCH] added runner

---
 .github/workflows/build_legion.yml   | 40 ++++++++++++++++++++++++++++
 .github/workflows/gpu-ci.yml         |  6 ++---
 .github/workflows/multinode-test.yml |  6 ++---
 docker/oracle_con.py                 | 38 ++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)
 create mode 100644 docker/oracle_con.py

diff --git a/.github/workflows/build_legion.yml b/.github/workflows/build_legion.yml
index 067d9d3c9c..e49303c8f4 100644
--- a/.github/workflows/build_legion.yml
+++ b/.github/workflows/build_legion.yml
@@ -14,6 +14,26 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  start-oraicle-instance:  # NOTE(review): no visible job `needs:` this one, so nothing waits for the VM to be started - confirm
+    runs-on: ubuntu-latest
+    name: Turning on Oracle VM - flexflow-ci
+    env:  # OCI credentials; docker/oracle_con.py reads the OCI_CLI_* values via os.getenv
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
+    steps:
+      - name: Checkout Git Repository  # the script below lives in this repository
+        uses: actions/checkout@v3
+
+      - name: Install Oci  # OCI Python SDK imported by the script
+        run: pip install oci
+
+      - name: Running script (START)
+        run: python3 docker/oracle_con.py --start --instance_id "$OCI_INSTANCE_ID"
+
   prebuild-legion:
     name: Prebuild Legion with CMake
     runs-on: [self-hosted, cpu_only]
@@ -104,3 +124,23 @@ jobs:
           TAG_NAME: ${{ env.RELEASE_DATETIME }}
           GITHUB_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }}
         run: gh release create $TAG_NAME ./unwrapped_artifacts/*.tar.gz --repo flexflow/flexflow-third-party
+
+  close-oraicle-instance:  # NOTE(review): no `needs:` here, so this job is not ordered after the jobs that use the VM - confirm
+    runs-on: ubuntu-latest
+    name: Turning off Oracle VM - flexflow-ci
+    env:  # OCI credentials; docker/oracle_con.py reads the OCI_CLI_* values via os.getenv
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
+    steps:
+      - name: Checkout Git Repository  # the script below lives in this repository
+        uses: actions/checkout@v3
+
+      - name: Install Oci  # OCI Python SDK imported by the script
+        run: pip install oci
+
+      - name: Running script (STOP)
+        run: python3 docker/oracle_con.py --stop --instance_id "$OCI_INSTANCE_ID"
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index d604a7cea9..aee16832f3 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -61,7 +61,7 @@ jobs:
 
   python-interface-check:
     name: Check Python Interface
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     defaults:
       run:
         shell: bash -l {0} # required to use an activated conda environment
@@ -135,7 +135,7 @@ jobs:
 
   inference-tests:
     name: Inference Tests
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     defaults:
       run:
         shell: bash -l {0} # required to use an activated conda environment
@@ -210,7 +210,7 @@ jobs:
 
   gpu-ci-flexflow:
     name: Single Machine, Multiple GPUs Tests
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     # skip this time-consuming test for PRs to the inference branch
     # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
     defaults:
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml
index 37f81b615f..bd13adae75 100644
--- a/.github/workflows/multinode-test.yml
+++ b/.github/workflows/multinode-test.yml
@@ -32,7 +32,7 @@ jobs:
     name: Multinode GPU Test with MPI
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     # 10h timeout, instead of default of 360min (6h)
     timeout-minutes: 600
@@ -83,7 +83,7 @@ jobs:
     name: Multinode GPU Test with UCX
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
@@ -134,7 +134,7 @@ jobs:
     name: Multinode GPU Test with native UCX
     # Prevent Github from running the workflow on forks
     if: github.repository_owner == 'flexflow'
-    runs-on: self-hosted
+    runs-on: [self-hosted, gpu]
     needs: gpu-ci-concierge
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
diff --git a/docker/oracle_con.py b/docker/oracle_con.py
new file mode 100644
index 0000000000..4fd41930b7
--- /dev/null
+++ b/docker/oracle_con.py
@@ -0,0 +1,38 @@
+import oci
+import argparse
+import os
+
+# Start or stop an OCI compute instance, authenticating with the OCI_CLI_* env vars.
+parser = argparse.ArgumentParser(description="Program with optional flags")
+# An action is mandatory: without required=True a bare invocation fell through to STOP.
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument("--start", action="store_true", help="Start action")
+group.add_argument("--stop", action="store_true", help="Stop action")
+parser.add_argument("--instance_id", type=str, required=True, help="instance id required")
+args = parser.parse_args()
+
+# Credentials are taken from the environment rather than from an OCI config file.
+config = {
+    "user": os.getenv("OCI_CLI_USER"),
+    "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
+    "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
+    "tenancy": os.getenv("OCI_CLI_TENANCY"),
+    "region": os.getenv("OCI_CLI_REGION"),
+}
+
+# Check the assembled configuration before building a client from it.
+oci.config.validate_config(config)
+
+# Initialize the ComputeClient to interact with VM instances
+compute = oci.core.ComputeClient(config)
+
+# Instance to act on, as supplied through --instance_id
+instance_id = args.instance_id
+
+# Perform the action
+if args.start:
+    # Start the VM
+    compute.instance_action(instance_id, "START")
+else:
+    # The group is required and mutually exclusive, so this branch means --stop
+    compute.instance_action(instance_id, "STOP")