From 36ef50372f03c53ea48ec4ea6a86c84fb2b2afa6 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 15 Apr 2024 06:53:22 +0200 Subject: [PATCH] Addressed comments --- ..._cpp_plugin_with_qwen.ipynb => qwen.ipynb} | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) rename modules/llama_cpp_plugin/notebooks/{llama_cpp_plugin_with_qwen.ipynb => qwen.ipynb} (90%) diff --git a/modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb b/modules/llama_cpp_plugin/notebooks/qwen.ipynb similarity index 90% rename from modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb rename to modules/llama_cpp_plugin/notebooks/qwen.ipynb index 4163faf76..f906b4f62 100644 --- a/modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb +++ b/modules/llama_cpp_plugin/notebooks/qwen.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "b2cb321e-2c20-45ea-93e0-a3bd16e5a120", + "metadata": {}, + "source": [ + "## QWEN model inference w/ OpenVINO's LLAMA_CPP plugin" + ] + }, { "cell_type": "markdown", "id": "2bb9b46a-d9c5-42dc-8e50-6700180aad0c", @@ -25,10 +33,7 @@ "source": [ "!pip install transformers[torch]\n", "!pip install tiktoken\n", - "!git clone https://github.com/ggerganov/llama.cpp\n", - "!pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt\n", - "!huggingface-cli download \"Qwen/Qwen-7B-Chat\" --local-dir qwen-7b-chat\n", - "!python3 llama.cpp/convert-hf-to-gguf.py qwen-7b-chat --outtype f32 --outfile qwen_7b_chat.gguf" + "!huggingface-cli download Qwen/Qwen1.5-7B-Chat-GGUF qwen1_5-7b-chat-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False" ] }, { @@ -51,7 +56,7 @@ "!git clone https://github.com/openvinotoolkit/openvino_contrib\n", "!git clone --recurse-submodules https://github.com/openvinotoolkit/openvino\n", "\n", - "# Add -DLLAMA_CUBLAS=1 to the cmake line below build the plugin with the CUDA backend.\n", + "# Add -DLLAMA_CUBLAS=1 to the cmake line below to build the plugin with the CUDA backend.\n", "# The underlying llama.cpp inference code will be executed on CUDA-powered GPUs on your host.\n", "!cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=../openvino_contrib/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON -DENABLE_PYTHON=1 -DENABLE_WHEEL=ON openvino #-DLLAMA_CUBLAS=1\n", "\n", @@ -95,7 +100,7 @@ "outputs": [], "source": [ "import openvino as ov\n", - "ov_model = ov.Core().compile_model(\"qwen_7b_chat.gguf\", \"LLAMA_CPP\")" + "ov_model = ov.Core().compile_model(\"qwen1_5-7b-chat-q5_k_m.gguf\", \"LLAMA_CPP\")" ] }, { @@ -184,9 +189,9 @@ "formatted_input_prompt = convert_history([[user_prompt, \"\"]])\n", "\n", "from transformers import AutoTokenizer\n", - "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", "\n", - "initial_prompt_tokens = tok(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids" + "initial_prompt_tokens = tokenizer(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids" ] }, { @@ -218,18 +223,19 @@ "\n", "output = ov_model({\"input_ids\": initial_prompt_tokens, \"position_ids\": position_ids})\n", "logits = output[\"logits\"]\n", - "last_token_id = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n", + "curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n", "\n", "MAX_TOKENS_GENERATED = 256\n", - "STOP_TOKENS = [tok(st, 
return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
+    "STOP_TOKENS = [tokenizer(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
     "\n",
     "curr_tokens_generated = 0\n",
-    "curr_token_ids = last_token_id.reshape([1, 1])\n",
+    "last_token_id = curr_token_ids[0][0]\n",
     "\n",
     "response_tokens = []\n",
     "next_position_id = sequence_length - 1\n",
     "\n",
-    "while (curr_token_ids[0][0] not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
+    "while (last_token_id not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
+    "    print(tokenizer.decode(last_token_id), end='')\n",
     "    curr_tokens_generated += 1\n",
     "    curr_position_ids = np.ndarray([1, 1], dtype=np.int64)\n",
     "    curr_position_ids[0][0] = next_position_id\n",
@@ -237,8 +243,7 @@
     "    curr_generated_output = ov_model({\"input_ids\": curr_token_ids, \"position_ids\": curr_position_ids})\n",
     "    curr_logits = curr_generated_output[\"logits\"]\n",
     "    curr_token_ids = np.argmax(curr_logits[:, -1, :], axis=1).reshape([1, 1])\n",
-    "    print(tok.decode(curr_token_ids[0][0]), end='')\n",
-    "    response_tokens.append(curr_token_ids)\n",
+    "    last_token_id = curr_token_ids[0][0]\n",
     "\n",
     "ov_model.create_infer_request().reset_state()"
    ]
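
For reference, below is a consolidated sketch of how the generation flow reads once this patch is applied: prefill the prompt in a single call, then greedily decode one token at a time while streaming the decoded text. It is not the notebook verbatim: the tokenizer repo name (`Qwen/Qwen1.5-7B-Chat`), the eos-only stop list (the notebook builds `STOP_TOKENS` from `model_configuration["stop_tokens"]`), and the position bookkeeping are assumptions made to keep the example self-contained.

```python
# Hedged, self-contained sketch of the post-patch flow; see the note above for
# which names are assumptions rather than taken from the notebook.
import numpy as np
import openvino as ov
from transformers import AutoTokenizer

# The GGUF file downloaded earlier in the notebook; "LLAMA_CPP" is the plugin device name.
ov_model = ov.Core().compile_model("qwen1_5-7b-chat-q5_k_m.gguf", "LLAMA_CPP")
# Assumption: tokenizer loaded from the Qwen1.5 chat repo instead of the notebook's model_name.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")

prompt = "What is OpenVINO?"
input_ids = tokenizer(prompt, return_tensors="np").input_ids  # shape (1, seq_len)
seq_len = input_ids.shape[1]
position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, seq_len)

# Prefill: run the whole prompt once, take the argmax of the last logits row.
logits = ov_model({"input_ids": input_ids, "position_ids": position_ids})["logits"]
curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape(1, 1)
last_token_id = int(curr_token_ids[0][0])

MAX_TOKENS_GENERATED = 256
STOP_TOKENS = [tokenizer.eos_token_id]  # assumption: eos only; the notebook uses model_configuration["stop_tokens"]
generated = 0
next_position_id = seq_len  # absolute position of the first generated token (assumed 0-based)

# Greedy decode loop: feed one token per call; the plugin keeps KV-cache state between calls.
while last_token_id not in STOP_TOKENS and generated < MAX_TOKENS_GENERATED:
    print(tokenizer.decode(last_token_id), end="", flush=True)
    curr_position_ids = np.array([[next_position_id]], dtype=np.int64)
    out = ov_model({"input_ids": curr_token_ids, "position_ids": curr_position_ids})
    curr_token_ids = np.argmax(out["logits"][:, -1, :], axis=1).reshape(1, 1)
    last_token_id = int(curr_token_ids[0][0])
    generated += 1
    next_position_id += 1

# Drop the internal KV-cache state before running another prompt, as the notebook does.
ov_model.create_infer_request().reset_state()
```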