Commit
Add dummy model in iGPU perf (#12341)
* Add dummy model in iGPU perf

* Add dummy model in iGPU perf

* Fix
JinBridger authored Nov 5, 2024
1 parent 8e9a3a1 commit 7240c28
Showing 8 changed files with 86 additions and 36 deletions.
108 changes: 72 additions & 36 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -53,6 +53,8 @@

MINICPM_V_IDS = ['openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-Llama3-V-2_5']

DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']

results = []
excludes = []

@@ -1245,6 +1247,10 @@ def run_transformer_int4_gpu_win(repo_id,
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
model = model.llm
elif repo_id in DUMMY_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1255,7 +1261,10 @@ def run_transformer_int4_gpu_win(repo_id,
print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))

model = BenchmarkWrapper(model)
streamer = TextStreamer(tokenizer, skip_prompt=True)
if repo_id not in DUMMY_IDS:
streamer = TextStreamer(tokenizer, skip_prompt=True)
else:
streaming = False

result = {}
with torch.inference_mode():
@@ -1264,14 +1273,17 @@ def run_transformer_int4_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
if repo_id not in DUMMY_IDS:
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
else:
input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -1288,9 +1300,10 @@ def run_transformer_int4_gpu_win(repo_id,
end = time.perf_counter()
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
if repo_id not in DUMMY_IDS:
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1369,6 +1382,11 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
model = model.llm
elif repo_id in DUMMY_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
torch_dtype=torch.float16).eval()
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1380,7 +1398,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))

model = BenchmarkWrapper(model)
streamer = TextStreamer(tokenizer, skip_prompt=True)
if repo_id not in DUMMY_IDS:
streamer = TextStreamer(tokenizer, skip_prompt=True)
else:
streaming = False

result = {}
with torch.inference_mode():
@@ -1389,14 +1410,17 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
if repo_id not in DUMMY_IDS:
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
else:
input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -1413,9 +1437,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
end = time.perf_counter()
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
if repo_id not in DUMMY_IDS:
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1590,6 +1615,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.half().to('xpu')
elif repo_id in DUMMY_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
model = model.to('xpu')
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1600,7 +1629,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))

model = BenchmarkWrapper(model)
streamer = TextStreamer(tokenizer, skip_prompt=True)
if repo_id not in DUMMY_IDS:
streamer = TextStreamer(tokenizer, skip_prompt=True)
else:
streaming = False

result = {}
with torch.inference_mode():
@@ -1609,14 +1641,17 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
if repo_id not in DUMMY_IDS:
input_str = get_continuation_input_str(in_len, tokenizer)
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
else:
input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -1633,9 +1668,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
end = time.perf_counter()
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
if repo_id not in DUMMY_IDS:
output = tokenizer.batch_decode(output_ids)
if not streaming:
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
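The run.py hunks above give the dummy repo ids their own branch in the benchmark functions shown: the model is still loaded and wrapped in BenchmarkWrapper, but the prompt is built from random token ids of exactly the requested length instead of a tokenized text prompt, and streaming and decoding of the generated ids are skipped because they are not meaningful text. A minimal sketch of that input path, with batch_size and in_len taken from the benchmark config (build_input_ids is a hypothetical helper name, not part of the commit; get_continuation_input_str is the existing helper in run.py):

import torch

DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']

def build_input_ids(repo_id, tokenizer, in_len, batch_size, device='xpu'):
    if repo_id in DUMMY_IDS:
        # Dummy path from the diff: random token ids in [1000, 2000),
        # shaped [batch_size, in_len]; no tokenizer encoding is needed.
        return torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to(device)
    # Normal path: encode a continuation prompt and slice it to the target length,
    # since different tokenizers produce different numbers of tokens for the same text.
    input_str = get_continuation_input_str(in_len, tokenizer)
    ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len]
    true_str = tokenizer.batch_decode(ids)[0]
    return tokenizer([true_str] * batch_size, return_tensors="pt").input_ids.to(device)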
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -19,6 +19,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
2 changes: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ repo_id:
- 'microsoft/Phi-3-mini-4k-instruct'
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
@@ -17,6 +17,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 3
num_trials: 5
2 changes: 2 additions & 0 deletions python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml
@@ -17,6 +17,8 @@ repo_id:
- 'microsoft/Phi-3-mini-128k-instruct'
- 'microsoft/phi-3-vision-128k-instruct'
- 'openbmb/MiniCPM-V-2_6'
- 'dummy/dummy-1.5B'
- 'dummy/dummy-4B'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
