diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Model/tinyllama/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Model/tinyllama/generate.py
index 605e3b85e21..44cc8e03f15 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Model/tinyllama/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Model/tinyllama/generate.py
@@ -41,10 +41,6 @@
                                                  npu_backend="openvino")
 
     print(model)
-
-    from benchmark_util import BenchmarkWrapper
-
-    model = BenchmarkWrapper(model, do_print=True)
 
     with torch.inference_mode():
         input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
diff --git a/python/llm/src/ipex_llm/transformers/npu/llama.py b/python/llm/src/ipex_llm/transformers/npu/llama.py
index 7494803d22d..7263c80132f 100644
--- a/python/llm/src/ipex_llm/transformers/npu/llama.py
+++ b/python/llm/src/ipex_llm/transformers/npu/llama.py
@@ -328,49 +328,3 @@ def llama_model_forward(
 
                     transformers.models.llama.modeling_llama.LlamaModel, llama_model_forward)
     return model
-
-
-import os
-
-# if __name__ == "__main__":
-#     assert transformers.__version__ == "4.35.1"
-#     model_id = "D:\\llm-models\\TinyLlama-1.1B-Chat-v1.0"
-#     # model_id = "D:\\llm-models\\Llama-2-7b-chat-hf"
-
-#     tmpdirname = "D:\\yang\\profile\\tiny_llama_decoder"
-#     # tmpdirname = "D:\\yang\\profile\\llama_7b_decoder"
-#     os.makedirs(tmpdirname, exist_ok=True)
-
-#     model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=torch.float32, low_cpu_mem_usage=True).eval()
-#     # model.model.embed_tokens.to(torch.float32)
-#     # model.model.norm.to(torch.float32)
-#     # model.lm_head.to(torch.float32)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True)
-#     tokenizer.padding_side = 'left'
-#     offload_llama_decoder_to_npu(22, 1024)
-#     query = "Once upon a time, there is a little girl named Lily who lives in a small village."
-#     tokens = tokenizer(query, return_tensors="pt")# , padding="max_length", max_length=20)
-
-#     prefix = tokens["input_ids"]
-#     print(tokens)
-
-#     from benchmark_util import BenchmarkWrapper
-
-#     model = BenchmarkWrapper(model, do_print=True)
-#     print(prefix.shape)
-
-#     generation_kwargs = dict(
-#         input_ids=prefix,
-#         # streamer=streamer,
-#         do_sample=False,
-#         top_k=50,
-#         top_p=0.9,
-#         max_new_tokens=16,
-#     )
-
-#     print("Run inference")
-#     for i in range(3):
-#         output = model.generate(**generation_kwargs)
-
-#     output_str = tokenizer.batch_decode(output)
-#     print(output_str)
\ No newline at end of file