From 7240c283a3c553d34ef8e2e97a00dc24418a808a Mon Sep 17 00:00:00 2001
From: "Jin, Qiao" <89779290+JinBridger@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:56:10 +0800
Subject: [PATCH] Add dummy model in iGPU perf (#12341)

* Add dummy model in iGPU perf

* Add dummy model in iGPU perf

* Fix
---
 python/llm/dev/benchmark/all-in-one/run.py    | 108 ++++++++++++------
 .../test/benchmark/igpu-perf/1024-128.yaml    |   2 +
 .../igpu-perf/1024-128_int4_fp16.yaml         |   2 +
 .../1024-128_int4_fp16_loadlowbit.yaml        |   2 +
 .../igpu-perf/2048-256_int4_fp16.yaml         |   2 +
 .../igpu-perf/3072-384_int4_fp16.yaml         |   2 +
 .../benchmark/igpu-perf/32-32_int4_fp16.yaml  |   2 +
 .../igpu-perf/4096-512_int4_fp16.yaml         |   2 +
 8 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 01f6a7c8aee..3ffb84c2e50 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -53,6 +53,8 @@ MINICPM_V_IDS = ['openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-Llama3-V-2_5']
 
+DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']
+
 results = []
 excludes = []
@@ -1245,6 +1247,10 @@ def run_transformer_int4_gpu_win(repo_id,
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
         model = model.llm
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1255,7 +1261,10 @@
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1264,14 +1273,17 @@
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = get_continuation_input_str(in_len, tokenizer)
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_list = [true_str] * batch_size
-            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            if repo_id not in DUMMY_IDS:
+                input_str = get_continuation_input_str(in_len, tokenizer)
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_list = [true_str] * batch_size
+                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            else:
+                input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -1288,9 +1300,10 @@
                 end = time.perf_counter()
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                if not streaming:
-                    print(output[0])
+                if repo_id not in DUMMY_IDS:
+                    output = tokenizer.batch_decode(output_ids)
+                    if not streaming:
+                        print(output[0])
                 actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
                     result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1369,6 +1382,11 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
         model = model.llm
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
+                                                     torch_dtype=torch.float16).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1380,7 +1398,10 @@
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1389,14 +1410,17 @@
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = get_continuation_input_str(in_len, tokenizer)
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_list = [true_str] * batch_size
-            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            if repo_id not in DUMMY_IDS:
+                input_str = get_continuation_input_str(in_len, tokenizer)
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_list = [true_str] * batch_size
+                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            else:
+                input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -1413,9 +1437,10 @@
                 end = time.perf_counter()
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                if not streaming:
-                    print(output[0])
+                if repo_id not in DUMMY_IDS:
+                    output = tokenizer.batch_decode(output_ids)
+                    if not streaming:
+                        print(output[0])
                 actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
                     result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1590,6 +1615,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1600,7 +1629,10 @@
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1609,14 +1641,17 @@
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = get_continuation_input_str(in_len, tokenizer)
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_list = [true_str] * batch_size
-            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            if repo_id not in DUMMY_IDS:
+                input_str = get_continuation_input_str(in_len, tokenizer)
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_list = [true_str] * batch_size
+                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+            else:
+                input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -1633,9 +1668,10 @@
                 end = time.perf_counter()
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                if not streaming:
-                    print(output[0])
+                if repo_id not in DUMMY_IDS:
+                    output = tokenizer.batch_decode(output_ids)
+                    if not streaming:
+                        print(output[0])
                 actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
                     result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index 759a7566237..c30cdebe3a3 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -19,6 +19,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index f66172d9a39..60930214e8d 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
index 76c35d4dde7..8c831d633a1 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
index bf5fc1e978b..ebffb515f97 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
index 60202594cba..3506b4054ad 100644
--- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
index e70178744a3..da0f27c8822 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
diff --git a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml
index 514037a7380..0d955d73298 100644
--- a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
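
Note (not part of the patch): the dummy-model branch added to run.py above reduces to benchmarking generate() on randomly drawn token IDs instead of a tokenized prompt, so no real checkpoint-specific tokenizer output or decoding is needed. The script below is a minimal standalone sketch of that pattern under stated assumptions: ipex-llm with XPU support is installed, `model_path` points to some local HF-format checkpoint of your choice, and the 1024-128 shape and 'sym_int4' low-bit format are illustrative placeholders rather than values fixed by this patch.

# Minimal sketch (not part of the patch) of the dummy-input benchmarking
# pattern implemented by the DUMMY_IDS branch in run.py.
# Assumptions: ipex-llm with XPU support is installed; `model_path`, the
# low-bit format, and the 1024/128 shape below are placeholders.
import time

import torch
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "path/to/local/checkpoint"     # placeholder path
low_bit = "sym_int4"                        # placeholder low-bit format
batch_size, in_len, out_len = 1, 1024, 128  # e.g. the 1024-128 config

model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True,
                                             load_in_low_bit=low_bit,
                                             trust_remote_code=True,
                                             use_cache=True).eval()
model = model.to('xpu')

with torch.inference_mode():
    # Random token ids stand in for a tokenized prompt, as in the `else`
    # branch guarded by DUMMY_IDS above; no tokenizer is required.
    input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
    st = time.perf_counter()
    output_ids = model.generate(input_ids, do_sample=False,
                                max_new_tokens=out_len, min_new_tokens=out_len)
    torch.xpu.synchronize()  # wait for XPU work to finish before reading the clock
    print("model generate cost: " + str(time.perf_counter() - st))
    # Decoding/printing the generated text is skipped for dummy runs,
    # mirroring the `repo_id not in DUMMY_IDS` guards in run.py.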