refine auto scaling
smtmfft committed Aug 14, 2024
1 parent 095602f commit 70636b4
Showing 1 changed file with 79 additions and 39 deletions.
118 changes: 79 additions & 39 deletions provers/risc0/driver/src/bonsai/auto_scaling.rs
@@ -1,4 +1,4 @@
-use anyhow::{Error, Result};
+use anyhow::{Error, Ok, Result};
 use lazy_static::lazy_static;
 use reqwest::{header::HeaderMap, header::HeaderValue, header::CONTENT_TYPE, Client};
 use serde::Deserialize;
@@ -14,6 +14,8 @@ struct ScalerResponse {
 struct BonsaiAutoScaler {
     url: String,
     headers: HeaderMap,
+    client: Client,
+    on_setting_status: Option<ScalerResponse>,
 }

 impl BonsaiAutoScaler {
@@ -22,16 +24,20 @@ impl BonsaiAutoScaler {
         let mut headers = HeaderMap::new();
         headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));
         headers.insert("x-api-key", HeaderValue::from_str(&api_key).unwrap());
-        Self { url, headers }
+
+        Self {
+            url,
+            headers,
+            client: Client::new(),
+            on_setting_status: None,
+        }
     }

     async fn get_bonsai_gpu_num(&self) -> Result<ScalerResponse> {
-        // Create a new client
-        let client = Client::new();
-
         debug!("Requesting scaler status from: {}", self.url);
         // Make the POST request
-        let response = client
+        let response = self
+            .client
             .get(self.url.clone())
             .headers(self.headers.clone())
             .send()
@@ -49,12 +55,16 @@ impl BonsaiAutoScaler {
         }
     }

-    async fn set_bonsai_gpu_num(&self, gpu_num: u32) -> Result<()> {
-        // Create a new client
-        let client = Client::new();
+    async fn set_bonsai_gpu_num(&mut self, gpu_num: u32) -> Result<()> {
+        if self.on_setting_status.is_some() {
+            // log an err if there is a race adjustment.
+            trace_err!("Last bonsai setting is not active, please check.");
+        }
+
         debug!("Requesting scaler status from: {}", self.url);
         // Make the POST request
-        let response = client
+        let response = self
+            .client
             .post(self.url.clone())
             .headers(self.headers.clone())
             .body(gpu_num.to_string())
@@ -63,75 +73,101 @@

         // Check if the request was successful
         if response.status().is_success() {
-            // Parse the JSON response
-            let data: ScalerResponse = response.json().await?;
-
-            // Use the parsed data
-            debug!("Scaler status: {:?}", data);
-            assert_eq!(data.desired, gpu_num);
+            self.on_setting_status = Some(ScalerResponse {
+                desired: gpu_num,
+                current: 0,
+                pending: 0,
+            });
             Ok(())
         } else {
             trace_err!("Request failed with status: {}", response.status());
             Err(Error::msg("Failed to get bonsai gpu num".to_string()))
         }
     }
+
+    async fn wait_for_bonsai_config_active(&mut self, time_out_sec: u64) -> Result<()> {
+        match &self.on_setting_status {
+            None => Ok(()),
+            Some(setting) => {
+                // loop until some timeout
+                let start_time = std::time::Instant::now();
+                let mut check_time = std::time::Instant::now();
+                while check_time.duration_since(start_time).as_secs() < time_out_sec {
+                    tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
+                    check_time = std::time::Instant::now();
+                    let current_bonsai_gpu_num = self.get_bonsai_gpu_num().await?;
+                    if current_bonsai_gpu_num.current == setting.desired {
+                        self.on_setting_status = None;
+                        return Ok(());
+                    }
+                }
+                Err(Error::msg(
+                    "checking bonsai config active timeout".to_string(),
+                ))
+            }
+        }
+    }
 }

 lazy_static! {
     static ref BONSAI_API_URL: String =
         env::var("BONSAI_API_URL").expect("BONSAI_API_URL must be set");
     static ref BONSAI_API_KEY: String =
         env::var("BONSAI_API_KEY").expect("BONSAI_API_KEY must be set");
+    static ref MAX_BONSAI_GPU_NUM: u32 = env::var("MAX_BONSAI_GPU_NUM")
+        .unwrap_or_else(|_| "15".to_string())
+        .parse()
+        .unwrap();
 }

-const MAX_BONSAI_GPU_NUM: u32 = 15;
+
 pub(crate) async fn maxpower_bonsai() -> Result<()> {
-    let auto_scaler = BonsaiAutoScaler::new(BONSAI_API_URL.to_string(), BONSAI_API_KEY.to_string());
+    let mut auto_scaler =
+        BonsaiAutoScaler::new(BONSAI_API_URL.to_string(), BONSAI_API_KEY.to_string());
     let current_gpu_num = auto_scaler.get_bonsai_gpu_num().await?;
     // either already maxed out or pending to be maxed out
-    if current_gpu_num.current == MAX_BONSAI_GPU_NUM
-        || (current_gpu_num.current + current_gpu_num.pending == MAX_BONSAI_GPU_NUM)
+    if current_gpu_num.current == *MAX_BONSAI_GPU_NUM
+        && current_gpu_num.desired == *MAX_BONSAI_GPU_NUM
+        && current_gpu_num.pending == 0
     {
         Ok(())
     } else {
-        auto_scaler.set_bonsai_gpu_num(MAX_BONSAI_GPU_NUM).await?;
-        // wait for the bonsai to heat up
-        tokio::time::sleep(tokio::time::Duration::from_secs(180)).await;
-        let scaler_status = auto_scaler.get_bonsai_gpu_num().await?;
-        assert!(scaler_status.current == MAX_BONSAI_GPU_NUM);
-        Ok(())
+        auto_scaler.set_bonsai_gpu_num(*MAX_BONSAI_GPU_NUM).await?;
+        auto_scaler.wait_for_bonsai_config_active(300).await
     }
 }

 pub(crate) async fn shutdown_bonsai() -> Result<()> {
-    let auto_scaler = BonsaiAutoScaler::new(BONSAI_API_URL.to_string(), BONSAI_API_KEY.to_string());
+    let mut auto_scaler =
+        BonsaiAutoScaler::new(BONSAI_API_URL.to_string(), BONSAI_API_KEY.to_string());
     let current_gpu_num = auto_scaler.get_bonsai_gpu_num().await?;
-    if current_gpu_num.current == 0 {
+    if current_gpu_num.current == 0 && current_gpu_num.pending == 0 && current_gpu_num.desired == 0
+    {
         Ok(())
     } else {
         auto_scaler.set_bonsai_gpu_num(0).await?;
-        // wait few minute for the bonsai to cool down
-        tokio::time::sleep(tokio::time::Duration::from_secs(60)).await;
-        let scaler_status = auto_scaler.get_bonsai_gpu_num().await?;
-        assert!(scaler_status.current == 0);
-        Ok(())
+        auto_scaler.wait_for_bonsai_config_active(30).await
     }
 }

 #[cfg(test)]
 mod test {
     use super::*;
     use std::env;
     use tokio;

     #[ignore]
     #[tokio::test]
     async fn test_bonsai_auto_scaler_get() {
         let bonsai_url = env::var("BONSAI_API_URL").expect("BONSAI_API_URL must be set");
         let bonsai_key = env::var("BONSAI_API_KEY").expect("BONSAI_API_KEY must be set");
+        let max_bonsai_gpu: u32 = env::var("MAX_BONSAI_GPU_NUM")
+            .unwrap_or_else(|_| "15".to_string())
+            .parse()
+            .unwrap();
         let auto_scaler = BonsaiAutoScaler::new(bonsai_url, bonsai_key);
         let scalar_status = auto_scaler.get_bonsai_gpu_num().await.unwrap();
-        assert!(scalar_status.current <= MAX_BONSAI_GPU_NUM);
+        assert!(scalar_status.current <= max_bonsai_gpu);
         assert_eq!(
             scalar_status.desired,
             scalar_status.current + scalar_status.pending
@@ -143,23 +179,27 @@ mod test {
     async fn test_bonsai_auto_scaler_set() {
         let bonsai_url = env::var("BONSAI_API_URL").expect("BONSAI_API_URL must be set");
         let bonsai_key = env::var("BONSAI_API_KEY").expect("BONSAI_API_KEY must be set");
-        let auto_scaler = BonsaiAutoScaler::new(bonsai_url, bonsai_key);
+        let mut auto_scaler = BonsaiAutoScaler::new(bonsai_url, bonsai_key);

         auto_scaler
             .set_bonsai_gpu_num(7)
             .await
             .expect("Failed to set bonsai gpu num");
-        // wait few minute for the bonsai to heat up
-        tokio::time::sleep(tokio::time::Duration::from_secs(200)).await;
+        auto_scaler
+            .wait_for_bonsai_config_active(300)
+            .await
+            .unwrap();
         let current_gpu_num = auto_scaler.get_bonsai_gpu_num().await.unwrap().current;
         assert_eq!(current_gpu_num, 7);

         auto_scaler
             .set_bonsai_gpu_num(0)
             .await
             .expect("Failed to set bonsai gpu num");
-        // wait few minute for the bonsai to cool down
-        tokio::time::sleep(tokio::time::Duration::from_secs(30)).await;
+        auto_scaler
+            .wait_for_bonsai_config_active(300)
+            .await
+            .unwrap();
         let current_gpu_num = auto_scaler.get_bonsai_gpu_num().await.unwrap().current;
         assert_eq!(current_gpu_num, 0);
     }