Commit

Addressed comments
vshampor committed Apr 15, 2024
1 parent 50dc316 commit 3246d0e
Showing 1 changed file with 19 additions and 14 deletions.
@@ -1,5 +1,13 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b2cb321e-2c20-45ea-93e0-a3bd16e5a120",
"metadata": {},
"source": [
"## QWEN model inference w/ OpenVINO's LLAMA_CPP plugin"
]
},
{
"cell_type": "markdown",
"id": "2bb9b46a-d9c5-42dc-8e50-6700180aad0c",
@@ -25,10 +33,7 @@
"source": [
"!pip install transformers[torch]\n",
"!pip install tiktoken\n",
"!git clone https://github.com/ggerganov/llama.cpp\n",
"!pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt\n",
"!huggingface-cli download \"Qwen/Qwen-7B-Chat\" --local-dir qwen-7b-chat\n",
"!python3 llama.cpp/convert-hf-to-gguf.py qwen-7b-chat --outtype f32 --outfile qwen_7b_chat.gguf"
"!huggingface-cli download Qwen/Qwen1.5-7B-Chat-GGUF qwen1_5-7b-chat-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False"
]
},
{
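The updated cell above downloads a pre-quantized GGUF file from the Hugging Face Hub instead of converting the original checkpoint with llama.cpp's conversion script. For reference, a minimal sketch of the same download through the huggingface_hub Python API; hf_hub_download is a stock huggingface_hub call and is not part of this notebook:

from huggingface_hub import hf_hub_download

# Fetch the pre-quantized GGUF weights referenced in the cell above.
gguf_path = hf_hub_download(
    repo_id="Qwen/Qwen1.5-7B-Chat-GGUF",
    filename="qwen1_5-7b-chat-q5_k_m.gguf",
    local_dir=".",
)
print(gguf_path)  # local path of the downloaded .gguf file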
@@ -51,7 +56,7 @@
"!git clone https://github.com/openvinotoolkit/openvino_contrib\n",
"!git clone --recurse-submodules https://github.com/openvinotoolkit/openvino\n",
"\n",
"# Add -DLLAMA_CUBLAS=1 to the cmake line below build the plugin with the CUDA backend.\n",
"# Add -DLLAMA_CUBLAS=1 to the cmake line below to build the plugin with the CUDA backend.\n",
"# The underlying llama.cpp inference code will be executed on CUDA-powered GPUs on your host.\n",
"!cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=../openvino_contrib/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON -DENABLE_PYTHON=1 -DENABLE_WHEEL=ON openvino #-DLLAMA_CUBLAS=1\n",
"\n",
@@ -95,7 +100,7 @@
"outputs": [],
"source": [
"import openvino as ov\n",
"ov_model = ov.Core().compile_model(\"qwen_7b_chat.gguf\", \"LLAMA_CPP\")"
"ov_model = ov.Core().compile_model(\"qwen1_5-7b-chat-q5_k_m.gguf\", \"LLAMA_CPP\")"
]
},
{
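Once the build above has produced and installed the Python wheel, the compiled GGUF model can be inspected like any other OpenVINO CompiledModel. A hedged sketch of such a check, assuming the wheel is installed in the active environment; whether LLAMA_CPP appears in available_devices depends on the plugin registration options used during the build:

import openvino as ov

core = ov.Core()
# The LLAMA_CPP device should be listed here if plugin registration succeeded.
print(core.available_devices)

# Inspect the input/output ports exposed for the compiled GGUF model.
for port in ov_model.inputs:
    print("input:", port.any_name, port.get_partial_shape())
for port in ov_model.outputs:
    print("output:", port.any_name, port.get_partial_shape())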
@@ -184,9 +189,9 @@
"formatted_input_prompt = convert_history([[user_prompt, \"\"]])\n",
"\n",
"from transformers import AutoTokenizer\n",
"tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
"\n",
"initial_prompt_tokens = tok(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids"
"initial_prompt_tokens = tokenizer(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids"
]
},
{
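A quick round-trip decode can confirm that the chat template was applied as intended before running inference. This is only an illustrative snippet reusing tokenizer and initial_prompt_tokens from the cell above:

# Decode the prompt tokens back to text to verify the chat formatting.
print(initial_prompt_tokens.shape)                 # expected shape: (1, sequence_length)
print(tokenizer.decode(initial_prompt_tokens[0]))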
@@ -218,27 +223,27 @@
"\n",
"output = ov_model({\"input_ids\": initial_prompt_tokens, \"position_ids\": position_ids})\n",
"logits = output[\"logits\"]\n",
"last_token_id = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n",
"curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n",
"\n",
"MAX_TOKENS_GENERATED = 256\n",
"STOP_TOKENS = [tok(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
"STOP_TOKENS = [tokenizer(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
"\n",
"curr_tokens_generated = 0\n",
"curr_token_ids = last_token_id.reshape([1, 1])\n",
"last_token_id = curr_token_ids[0][0]\n",
"\n",
"response_tokens = []\n",
"next_position_id = sequence_length - 1\n",
"\n",
"while (curr_token_ids[0][0] not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
"while (last_token_id not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED): \n",
" print(tokenizer.decode(last_token_id), end='')\n",
" curr_tokens_generated += 1\n",
" curr_position_ids = np.ndarray([1, 1], dtype=np.int64)\n",
" curr_position_ids[0][0] = next_position_id \n",
" next_position_id += 1\n",
" curr_generated_output = ov_model({\"input_ids\": curr_token_ids, \"position_ids\": curr_position_ids})\n",
" curr_logits = curr_generated_output[\"logits\"]\n",
" curr_token_ids = np.argmax(curr_logits[:, -1, :], axis=1).reshape([1, 1])\n",
" print(tok.decode(curr_token_ids[0][0]), end='')\n",
" response_tokens.append(curr_token_ids)\n",
" last_token_id = curr_token_ids[0][0]\n",
"\n",
"ov_model.create_infer_request().reset_state()"
]
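The loop above streams the greedy (argmax) continuation token by token; once it exits, the accumulated response_tokens can also be decoded in one pass. A minimal sketch assuming the variables defined in the cell above:

import numpy as np

# Each entry of response_tokens is a [1, 1] array holding one generated token id.
response_ids = np.concatenate(response_tokens, axis=1)[0]
print(tokenizer.decode(response_ids, skip_special_tokens=True))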