From 36ef50372f03c53ea48ec4ea6a86c84fb2b2afa6 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 15 Apr 2024 06:53:22 +0200 Subject: [PATCH] Addressed comments --- ..._cpp_plugin_with_qwen.ipynb => qwen.ipynb} | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) rename modules/llama_cpp_plugin/notebooks/{llama_cpp_plugin_with_qwen.ipynb => qwen.ipynb} (90%) diff --git a/modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb b/modules/llama_cpp_plugin/notebooks/qwen.ipynb similarity index 90% rename from modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb rename to modules/llama_cpp_plugin/notebooks/qwen.ipynb index 4163faf76..f906b4f62 100644 --- a/modules/llama_cpp_plugin/notebooks/llama_cpp_plugin_with_qwen.ipynb +++ b/modules/llama_cpp_plugin/notebooks/qwen.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "b2cb321e-2c20-45ea-93e0-a3bd16e5a120", + "metadata": {}, + "source": [ + "## QWEN model inference w/ OpenVINO's LLAMA_CPP plugin" + ] + }, { "cell_type": "markdown", "id": "2bb9b46a-d9c5-42dc-8e50-6700180aad0c", @@ -25,10 +33,7 @@ "source": [ "!pip install transformers[torch]\n", "!pip install tiktoken\n", - "!git clone https://github.com/ggerganov/llama.cpp\n", - "!pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt\n", - "!huggingface-cli download \"Qwen/Qwen-7B-Chat\" --local-dir qwen-7b-chat\n", - "!python3 llama.cpp/convert-hf-to-gguf.py qwen-7b-chat --outtype f32 --outfile qwen_7b_chat.gguf" + "!huggingface-cli download Qwen/Qwen1.5-7B-Chat-GGUF qwen1_5-7b-chat-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False" ] }, { @@ -51,7 +56,7 @@ "!git clone https://github.com/openvinotoolkit/openvino_contrib\n", "!git clone --recurse-submodules https://github.com/openvinotoolkit/openvino\n", "\n", - "# Add -DLLAMA_CUBLAS=1 to the cmake line below build the plugin with the CUDA backend.\n", + "# Add -DLLAMA_CUBLAS=1 to the cmake line below to build the plugin with the CUDA backend.\n", "# The underlying llama.cpp inference code will be executed on CUDA-powered GPUs on your host.\n", "!cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=../openvino_contrib/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON -DENABLE_PYTHON=1 -DENABLE_WHEEL=ON openvino #-DLLAMA_CUBLAS=1\n", "\n", @@ -95,7 +100,7 @@ "outputs": [], "source": [ "import openvino as ov\n", - "ov_model = ov.Core().compile_model(\"qwen_7b_chat.gguf\", \"LLAMA_CPP\")" + "ov_model = ov.Core().compile_model(\"qwen1_5-7b-chat-q5_k_m.gguf\", \"LLAMA_CPP\")" ] }, { @@ -184,9 +189,9 @@ "formatted_input_prompt = convert_history([[user_prompt, \"\"]])\n", "\n", "from transformers import AutoTokenizer\n", - "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", "\n", - "initial_prompt_tokens = tok(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids" + "initial_prompt_tokens = tokenizer(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids" ] }, { @@ -218,18 +223,19 @@ "\n", "output = ov_model({\"input_ids\": initial_prompt_tokens, \"position_ids\": position_ids})\n", "logits = output[\"logits\"]\n", - "last_token_id = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n", + "curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n", "\n", "MAX_TOKENS_GENERATED = 256\n", - "STOP_TOKENS = [tok(st, 
return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
+    "STOP_TOKENS = [tokenizer(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
     "\n",
     "curr_tokens_generated = 0\n",
-    "curr_token_ids = last_token_id.reshape([1, 1])\n",
+    "last_token_id = curr_token_ids[0][0]\n",
     "\n",
     "response_tokens = []\n",
     "next_position_id = sequence_length - 1\n",
     "\n",
-    "while (curr_token_ids[0][0] not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
+    "while (last_token_id not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
+    "    print(tokenizer.decode(last_token_id), end='')\n",
     "    curr_tokens_generated += 1\n",
     "    curr_position_ids = np.ndarray([1, 1], dtype=np.int64)\n",
     "    curr_position_ids[0][0] = next_position_id\n",
@@ -237,8 +243,7 @@
     "    curr_generated_output = ov_model({\"input_ids\": curr_token_ids, \"position_ids\": curr_position_ids})\n",
     "    curr_logits = curr_generated_output[\"logits\"]\n",
     "    curr_token_ids = np.argmax(curr_logits[:, -1, :], axis=1).reshape([1, 1])\n",
-    "    print(tok.decode(curr_token_ids[0][0]), end='')\n",
-    "    response_tokens.append(curr_token_ids)\n",
+    "    last_token_id = curr_token_ids[0][0]\n",
     "\n",
     "ov_model.create_infer_request().reset_state()"
    ]
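
For reference, below is a consolidated sketch of how the generation flow reads once this patch is applied: prefill the prompt in a single call, then greedily decode one token at a time while streaming the decoded text. It is not the notebook verbatim: the tokenizer repo name (`Qwen/Qwen1.5-7B-Chat`), the eos-only stop list (the notebook builds `STOP_TOKENS` from `model_configuration["stop_tokens"]`), and the position bookkeeping are assumptions made to keep the example self-contained.

```python
# Hedged, self-contained sketch of the post-patch flow; see the note above for
# which names are assumptions rather than taken from the notebook.
import numpy as np
import openvino as ov
from transformers import AutoTokenizer

# The GGUF file downloaded earlier in the notebook; "LLAMA_CPP" is the plugin device name.
ov_model = ov.Core().compile_model("qwen1_5-7b-chat-q5_k_m.gguf", "LLAMA_CPP")
# Assumption: tokenizer loaded from the Qwen1.5 chat repo instead of the notebook's model_name.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")

prompt = "What is OpenVINO?"
input_ids = tokenizer(prompt, return_tensors="np").input_ids  # shape (1, seq_len)
seq_len = input_ids.shape[1]
position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, seq_len)

# Prefill: run the whole prompt once, take the argmax of the last logits row.
logits = ov_model({"input_ids": input_ids, "position_ids": position_ids})["logits"]
curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape(1, 1)
last_token_id = int(curr_token_ids[0][0])

MAX_TOKENS_GENERATED = 256
STOP_TOKENS = [tokenizer.eos_token_id]  # assumption: eos only; the notebook uses model_configuration["stop_tokens"]
generated = 0
next_position_id = seq_len  # absolute position of the first generated token (assumed 0-based)

# Greedy decode loop: feed one token per call; the plugin keeps KV-cache state between calls.
while last_token_id not in STOP_TOKENS and generated < MAX_TOKENS_GENERATED:
    print(tokenizer.decode(last_token_id), end="", flush=True)
    curr_position_ids = np.array([[next_position_id]], dtype=np.int64)
    out = ov_model({"input_ids": curr_token_ids, "position_ids": curr_position_ids})
    curr_token_ids = np.argmax(out["logits"][:, -1, :], axis=1).reshape(1, 1)
    last_token_id = int(curr_token_ids[0][0])
    generated += 1
    next_position_id += 1

# Drop the internal KV-cache state before running another prompt, as the notebook does.
ov_model.create_infer_request().reset_state()
```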