clean up
yangw1234 committed Jun 20, 2024
1 parent 6bbd7dc commit 9a31ea5
Showing 2 changed files with 0 additions and 50 deletions.
@@ -41,10 +41,6 @@
npu_backend="openvino")

print(model)

from benchmark_util import BenchmarkWrapper

model = BenchmarkWrapper(model, do_print=True)

with torch.inference_mode():
input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
python/llm/src/ipex_llm/transformers/npu/llama.py (46 changes: 0 additions & 46 deletions)
@@ -328,49 +328,3 @@ def llama_model_forward(
 transformers.models.llama.modeling_llama.LlamaModel,
 llama_model_forward)
 return model
-
-
-import os
-
-# if __name__ == "__main__":
-#     assert transformers.__version__ == "4.35.1"
-#     model_id = "D:\\llm-models\\TinyLlama-1.1B-Chat-v1.0"
-#     # model_id = "D:\\llm-models\\Llama-2-7b-chat-hf"
-
-#     tmpdirname = "D:\\yang\\profile\\tiny_llama_decoder"
-#     # tmpdirname = "D:\\yang\\profile\\llama_7b_decoder"
-#     os.makedirs(tmpdirname, exist_ok=True)
-
-#     model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=torch.float32, low_cpu_mem_usage=True).eval()
-#     # model.model.embed_tokens.to(torch.float32)
-#     # model.model.norm.to(torch.float32)
-#     # model.lm_head.to(torch.float32)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True)
-#     tokenizer.padding_side = 'left'
-#     offload_llama_decoder_to_npu(22, 1024)
-#     query = "Once upon a time, there is a little girl named Lily who lives in a small village."
-#     tokens = tokenizer(query, return_tensors="pt")  # , padding="max_length", max_length=20)
-
-#     prefix = tokens["input_ids"]
-#     print(tokens)
-
-#     from benchmark_util import BenchmarkWrapper
-
-#     model = BenchmarkWrapper(model, do_print=True)
-#     print(prefix.shape)
-
-#     generation_kwargs = dict(
-#         input_ids=prefix,
-#         # streamer=streamer,
-#         do_sample=False,
-#         top_k=50,
-#         top_p=0.9,
-#         max_new_tokens=16,
-#     )
-
-#     print("Run inference")
-#     for i in range(3):
-#         output = model.generate(**generation_kwargs)
-
-#         output_str = tokenizer.batch_decode(output)
-#         print(output_str)
