From b74c82524fb1a80a38a384f689a4c794d680bbaa Mon Sep 17 00:00:00 2001 From: jinbridge <2635480475@qq.com> Date: Tue, 5 Nov 2024 16:49:28 +0800 Subject: [PATCH 1/3] Add dummy model in iGPU perf --- python/llm/dev/benchmark/all-in-one/run.py | 73 +++++++++++++------ .../igpu-perf/1024-128_int4_fp16.yaml | 2 + .../1024-128_int4_fp16_loadlowbit.yaml | 2 + .../igpu-perf/2048-256_int4_fp16.yaml | 2 + .../igpu-perf/3072-384_int4_fp16.yaml | 2 + .../benchmark/igpu-perf/32-32_int4_fp16.yaml | 2 + .../igpu-perf/4096-512_int4_fp16.yaml | 2 + 7 files changed, 61 insertions(+), 24 deletions(-) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 01f6a7c8aee..27433f546ba 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -53,6 +53,8 @@ MINICPM_V_IDS = ['openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-Llama3-V-2_5'] +DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B'] + results = [] excludes = [] @@ -1369,6 +1371,11 @@ def run_transformer_int4_fp16_gpu_win(repo_id, tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = model.to('xpu') model = model.llm + if repo_id in DUMMY_IDS: + model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, + trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, + torch_dtype=torch.float16).eval() + model = model.to('xpu') else: model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, @@ -1380,7 +1387,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id, print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3))) model = BenchmarkWrapper(model) - streamer = TextStreamer(tokenizer, skip_prompt=True) + if repo_id not in DUMMY_IDS: + streamer = TextStreamer(tokenizer, skip_prompt=True) + else: + streaming = False result = {} with torch.inference_mode(): @@ -1389,14 +1399,17 @@ def run_transformer_int4_fp16_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len, tokenizer) - # As different tokenizer has different encodings, - # slice the input_ids to ensure the prompt length is required length. - input_ids = tokenizer.encode(input_str, return_tensors="pt") - input_ids = input_ids[:, :in_len] - true_str = tokenizer.batch_decode(input_ids)[0] - input_list = [true_str] * batch_size - input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + if repo_id not in DUMMY_IDS: + input_str = get_continuation_input_str(in_len, tokenizer) + # As different tokenizer has different encodings, + # slice the input_ids to ensure the prompt length is required length. 
+ input_ids = tokenizer.encode(input_str, return_tensors="pt") + input_ids = input_ids[:, :in_len] + true_str = tokenizer.batch_decode(input_ids)[0] + input_list = [true_str] * batch_size + input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + else: + input_ids = torch.randint(1000, 2000, [1, in_len], dtype=torch.int64).to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up): @@ -1413,9 +1426,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id, end = time.perf_counter() output_ids = output_ids.cpu() print("model generate cost: " + str(end - st)) - output = tokenizer.batch_decode(output_ids) - if not streaming: - print(output[0]) + if repo_id not in DUMMY_IDS: + output = tokenizer.batch_decode(output_ids) + if not streaming: + print(output[0]) actual_out_len = output_ids.shape[1] - actual_in_len if i >= warm_up: result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, @@ -1590,6 +1604,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, use_cache=True, cpu_embedding=cpu_embedding).eval() tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True) model = model.half().to('xpu') + if repo_id in DUMMY_IDS: + model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True, + use_cache=True, cpu_embedding=cpu_embedding).eval() + model = model.to('xpu') else: model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() @@ -1600,7 +1618,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3))) model = BenchmarkWrapper(model) - streamer = TextStreamer(tokenizer, skip_prompt=True) + if repo_id not in DUMMY_IDS: + streamer = TextStreamer(tokenizer, skip_prompt=True) + else: + streaming = False result = {} with torch.inference_mode(): @@ -1609,14 +1630,17 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len, tokenizer) - # As different tokenizer has different encodings, - # slice the input_ids to ensure the prompt length is required length. - input_ids = tokenizer.encode(input_str, return_tensors="pt") - input_ids = input_ids[:, :in_len] - true_str = tokenizer.batch_decode(input_ids)[0] - input_list = [true_str] * batch_size - input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + if repo_id not in DUMMY_IDS: + input_str = get_continuation_input_str(in_len, tokenizer) + # As different tokenizer has different encodings, + # slice the input_ids to ensure the prompt length is required length. 
+ input_ids = tokenizer.encode(input_str, return_tensors="pt") + input_ids = input_ids[:, :in_len] + true_str = tokenizer.batch_decode(input_ids)[0] + input_list = [true_str] * batch_size + input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + else: + input_ids = torch.randint(1000, 2000, [1, in_len], dtype=torch.int64).to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up): @@ -1633,9 +1657,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, end = time.perf_counter() output_ids = output_ids.cpu() print("model generate cost: " + str(end - st)) - output = tokenizer.batch_decode(output_ids) - if not streaming: - print(output[0]) + if repo_id not in DUMMY_IDS: + output = tokenizer.batch_decode(output_ids) + if not streaming: + print(output[0]) actual_out_len = output_ids.shape[1] - actual_in_len if i >= warm_up: result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml index f66172d9a39..60930214e8d 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml @@ -18,6 +18,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml index 76c35d4dde7..8c831d633a1 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml @@ -17,6 +17,8 @@ repo_id: - 'microsoft/Phi-3-mini-4k-instruct' - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml index bf5fc1e978b..ebffb515f97 100644 --- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml @@ -18,6 +18,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml index 60202594cba..3506b4054ad 100644 --- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml @@ -17,6 +17,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml index e70178744a3..da0f27c8822 100644 --- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml 
@@ -18,6 +18,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 3 num_trials: 5 diff --git a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml index 514037a7380..0d955d73298 100644 --- a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml @@ -17,6 +17,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 From f47f72b61e89b6f13b5993596ad903d6f632bb2c Mon Sep 17 00:00:00 2001 From: jinbridge <2635480475@qq.com> Date: Tue, 5 Nov 2024 16:53:04 +0800 Subject: [PATCH 2/3] Add dummy model in iGPU perf --- python/llm/test/benchmark/igpu-perf/1024-128.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml index 759a7566237..c30cdebe3a3 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml @@ -19,6 +19,8 @@ repo_id: - 'microsoft/Phi-3-mini-128k-instruct' - 'microsoft/phi-3-vision-128k-instruct' - 'openbmb/MiniCPM-V-2_6' + - 'dummy/dummy-1.5B' + - 'dummy/dummy-4B' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 From f70b97e9d052739477ca0326d75dae62b33ddfd7 Mon Sep 17 00:00:00 2001 From: jinbridge <2635480475@qq.com> Date: Tue, 5 Nov 2024 16:58:38 +0800 Subject: [PATCH 3/3] Fix --- python/llm/dev/benchmark/all-in-one/run.py | 43 ++++++++++++++-------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 27433f546ba..3ffb84c2e50 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -1247,6 +1247,10 @@ def run_transformer_int4_gpu_win(repo_id, tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = model.to('xpu') model = model.llm + elif repo_id in DUMMY_IDS: + model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, + trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() + model = model.to('xpu') else: model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() @@ -1257,7 +1261,10 @@ def run_transformer_int4_gpu_win(repo_id, print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3))) model = BenchmarkWrapper(model) - streamer = TextStreamer(tokenizer, skip_prompt=True) + if repo_id not in DUMMY_IDS: + streamer = TextStreamer(tokenizer, skip_prompt=True) + else: + streaming = False result = {} with torch.inference_mode(): @@ -1266,14 +1273,17 @@ def run_transformer_int4_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len, tokenizer) - # As different tokenizer has different encodings, - # slice the input_ids to ensure the prompt length is required length. 
- input_ids = tokenizer.encode(input_str, return_tensors="pt") - input_ids = input_ids[:, :in_len] - true_str = tokenizer.batch_decode(input_ids)[0] - input_list = [true_str] * batch_size - input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + if repo_id not in DUMMY_IDS: + input_str = get_continuation_input_str(in_len, tokenizer) + # As different tokenizer has different encodings, + # slice the input_ids to ensure the prompt length is required length. + input_ids = tokenizer.encode(input_str, return_tensors="pt") + input_ids = input_ids[:, :in_len] + true_str = tokenizer.batch_decode(input_ids)[0] + input_list = [true_str] * batch_size + input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') + else: + input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up): @@ -1290,9 +1300,10 @@ def run_transformer_int4_gpu_win(repo_id, end = time.perf_counter() output_ids = output_ids.cpu() print("model generate cost: " + str(end - st)) - output = tokenizer.batch_decode(output_ids) - if not streaming: - print(output[0]) + if repo_id not in DUMMY_IDS: + output = tokenizer.batch_decode(output_ids) + if not streaming: + print(output[0]) actual_out_len = output_ids.shape[1] - actual_in_len if i >= warm_up: result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, @@ -1371,7 +1382,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = model.to('xpu') model = model.llm - if repo_id in DUMMY_IDS: + elif repo_id in DUMMY_IDS: model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, torch_dtype=torch.float16).eval() @@ -1409,7 +1420,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, input_list = [true_str] * batch_size input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') else: - input_ids = torch.randint(1000, 2000, [1, in_len], dtype=torch.int64).to('xpu') + input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up): @@ -1604,7 +1615,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, use_cache=True, cpu_embedding=cpu_embedding).eval() tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True) model = model.half().to('xpu') - if repo_id in DUMMY_IDS: + elif repo_id in DUMMY_IDS: model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() model = model.to('xpu') @@ -1640,7 +1651,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, input_list = [true_str] * batch_size input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') else: - input_ids = torch.randint(1000, 2000, [1, in_len], dtype=torch.int64).to('xpu') + input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up):
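The input-preparation change this series makes can be summarized outside the harness as follows: for real models the prompt is tokenized and then sliced to exactly in_len tokens (since different tokenizers encode the same text to different lengths), while for the dummy entries the tokenizer, streamer, and output decoding are skipped entirely and random token IDs of shape [batch_size, in_len] are fed straight to the model. Below is a minimal, standalone Python sketch of that logic under stated assumptions: make_benchmark_inputs, its tokenizer/prompt parameters, and keeping the tensor on CPU instead of moving it to 'xpu' are illustrative choices for this sketch and are not part of run.py; only DUMMY_IDS and the torch.randint(1000, 2000, ...) range mirror the patch.

    # Minimal sketch of the dummy-vs-real input preparation added in this series.
    # Assumed helper name and signature; not the actual run.py API.
    import torch

    DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']  # mirrors the patch

    def make_benchmark_inputs(repo_id, in_len, batch_size, tokenizer=None, prompt=None):
        """Return input_ids of shape [batch_size, in_len] (kept on CPU in this sketch)."""
        if repo_id in DUMMY_IDS:
            # Dummy path: arbitrary token IDs in a fixed vocab range, matching the
            # patch's torch.randint(1000, 2000, [batch_size, in_len]) call. No
            # tokenizer, no TextStreamer, and no decoding of the generated output.
            return torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64)
        # Real-model path: encode a long continuation prompt, then slice to in_len
        # because different tokenizers produce different numbers of tokens.
        input_ids = tokenizer.encode(prompt, return_tensors="pt")[:, :in_len]
        true_str = tokenizer.batch_decode(input_ids)[0]
        return tokenizer([true_str] * batch_size, return_tensors="pt").input_ids

    if __name__ == "__main__":
        ids = make_benchmark_inputs('dummy/dummy-1.5B', in_len=1024, batch_size=1)
        print(ids.shape)  # torch.Size([1, 1024])

The third commit in the series is what makes the benchmark code consistent with this shape: it changes the "if repo_id in DUMMY_IDS" checks in the fp16 and load-low-bit paths to "elif" so they do not override the MiniCPM-V branch, and replaces the earlier [1, in_len] dummy shape with [batch_size, in_len] so batched configurations measure the intended prompt shape.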