feat: Llama 3 support (#205)
* feat: Llama 3 support
* feat: `--gpu` flag in generation CLI commands
* feat: `specialTokens` parameter for `model.detokenize`
* fix: `FunctionaryChatWrapper` bugs
* fix: function calling syntax bugs
* fix: show `GPU layers` in the `Model` line in CLI commands
* refactor: rename `LlamaChatWrapper` to `Llama2ChatWrapper`
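
Below is a minimal sketch of the new `specialTokens` parameter for `model.detokenize` mentioned above. The `model` object is a stand-in for an already-loaded model instance (how it is loaded is unchanged by this commit), and its declared shape mirrors the addon typings further down in this diff.

```typescript
// Hypothetical stand-in for a loaded model; the shape mirrors the
// tokenize/detokenize changes in src/bindings/AddonTypes.ts below.
declare const model: {
    tokenize(text: string, specialTokens?: boolean): Uint32Array,
    detokenize(tokens: Uint32Array, specialTokens?: boolean): string
};

// Parse the text with special-token support so "<|begin_of_text|>" becomes a single token
const tokens = model.tokenize("<|begin_of_text|>Hello", true);

console.log(model.detokenize(tokens));       // special tokens are omitted from the output (previous behavior)
console.log(model.detokenize(tokens, true)); // new: special tokens are rendered in the output
```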
giladgd authored Apr 24, 2024
1 parent d332b77 commit ef501f9
Showing 46 changed files with 1,387 additions and 358 deletions.
3 changes: 2 additions & 1 deletion .vitepress/config.ts
@@ -18,7 +18,8 @@ const hostname = "https://withcatai.github.io/node-llama-cpp/";

const chatWrappersOrder = [
"GeneralChatWrapper",
"LlamaChatWrapper",
"Llama3ChatWrapper",
"Llama2ChatWrapper",
"ChatMLChatWrapper",
"FalconChatWrapper"
] as const;
6 changes: 3 additions & 3 deletions docs/guide/chat-prompt-wrapper.md
@@ -42,12 +42,12 @@ The [`LlamaChatSession`](/api/classes/LlamaChatSession) class allows you to chat

To do that, it uses a chat prompt wrapper to handle the unique format of the model you use.

For example, to chat with a LLama model, you can use [LlamaChatWrapper](/api/classes/LlamaChatWrapper):
For example, to chat with a LLama model, you can use [Llama3ChatWrapper](/api/classes/Llama3ChatWrapper):

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {LlamaModel, LlamaContext, LlamaChatSession, LlamaChatWrapper} from "node-llama-cpp";
import {LlamaModel, LlamaContext, LlamaChatSession, Llama3ChatWrapper} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

@@ -57,7 +57,7 @@ const model = new LlamaModel({
const context = new LlamaContext({model});
const session = new LlamaChatSession({
context,
chatWrapper: new LlamaChatWrapper() // by default, "auto" is used
chatWrapper: new Llama3ChatWrapper() // by default, "auto" is used
});


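The docs excerpt above ends right after the session is created; the typical continuation in that guide is prompting the session, roughly like this:

```typescript
const q1 = "Hi there, how are you?";
console.log("User: " + q1);

// `session` is the LlamaChatSession created in the example above
const a1 = await session.prompt(q1);
console.log("AI: " + a1);
```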
4 changes: 2 additions & 2 deletions docs/guide/chat-session.md
@@ -39,14 +39,14 @@ To learn more about chat prompt wrappers, see the [chat prompt wrapper guide](./
import {fileURLToPath} from "url";
import path from "path";
import {
LlamaModel, LlamaContext, LlamaChatSession, LlamaChatWrapper
LlamaModel, LlamaContext, LlamaChatSession, Llama3ChatWrapper
} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const model = new LlamaModel({
modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf"),
chatWrapper: new LlamaChatWrapper()
chatWrapper: new Llama3ChatWrapper()
});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});
56 changes: 49 additions & 7 deletions llama/addon.cpp
@@ -108,12 +108,12 @@ static void adjustNapiExternalMemorySubtract(Napi::Env env, uint64_t size) {
}
}

std::string addon_model_token_to_piece(const struct llama_model* model, llama_token token) {
std::string addon_model_token_to_piece(const struct llama_model* model, llama_token token, bool specialTokens) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size());
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), specialTokens);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(model, token, result.data(), result.size());
int check = llama_token_to_piece(model, token, result.data(), result.size(), specialTokens);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -378,13 +378,16 @@ class AddonModel : public Napi::ObjectWrap<AddonModel> {
}

Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>();
bool decodeSpecialTokens = info.Length() > 0
? info[1].As<Napi::Boolean>().Value()
: false;

// Create a stringstream for accumulating the decoded string.
std::stringstream ss;

// Decode each token and accumulate the result.
for (size_t i = 0; i < tokens.ElementLength(); i++) {
const std::string piece = addon_model_token_to_piece(model, (llama_token)tokens[i]);
const std::string piece = addon_model_token_to_piece(model, (llama_token)tokens[i], decodeSpecialTokens);

if (piece.empty()) {
continue;
@@ -534,6 +537,20 @@ class AddonModel : public Napi::ObjectWrap<AddonModel> {

return Napi::Number::From(info.Env(), int32_t(tokenType));
}
Napi::Value IsEogToken(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
return info.Env().Undefined();
}

if (info[0].IsNumber() == false) {
return Napi::Boolean::New(info.Env(), false);
}

int token = info[0].As<Napi::Number>().Int32Value();

return Napi::Boolean::New(info.Env(), llama_token_is_eog(model, token));
}
Napi::Value GetVocabularyType(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
@@ -581,6 +598,7 @@ class AddonModel : public Napi::ObjectWrap<AddonModel> {
InstanceMethod("eotToken", &AddonModel::EotToken),
InstanceMethod("getTokenString", &AddonModel::GetTokenString),
InstanceMethod("getTokenType", &AddonModel::GetTokenType),
InstanceMethod("isEogToken", &AddonModel::IsEogToken),
InstanceMethod("getVocabularyType", &AddonModel::GetVocabularyType),
InstanceMethod("shouldPrependBosToken", &AddonModel::ShouldPrependBosToken),
InstanceMethod("getModelSize", &AddonModel::GetModelSize),
@@ -1054,6 +1072,30 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
return info.Env().Undefined();
}

Napi::Value CanBeNextTokenForGrammarEvaluationState(const Napi::CallbackInfo& info) {
AddonGrammarEvaluationState* grammar_evaluation_state =
Napi::ObjectWrap<AddonGrammarEvaluationState>::Unwrap(info[0].As<Napi::Object>());
llama_token tokenId = info[1].As<Napi::Number>().Int32Value();

if ((grammar_evaluation_state)->grammar != nullptr) {
std::vector<llama_token_data> candidates;
candidates.reserve(1);
candidates.emplace_back(llama_token_data { tokenId, 1, 0.0f });

llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

llama_sample_grammar(ctx, &candidates_p, (grammar_evaluation_state)->grammar);

if (candidates_p.size == 0 || candidates_p.data[0].logit == -INFINITY) {
return Napi::Boolean::New(info.Env(), false);
}

return Napi::Boolean::New(info.Env(), true);
}

return Napi::Boolean::New(info.Env(), false);
}

Napi::Value GetEmbedding(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException();
@@ -1118,6 +1160,7 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
InstanceMethod("decodeBatch", &AddonContext::DecodeBatch),
InstanceMethod("sampleToken", &AddonContext::SampleToken),
InstanceMethod("acceptGrammarEvaluationStateToken", &AddonContext::AcceptGrammarEvaluationStateToken),
InstanceMethod("canBeNextTokenForGrammarEvaluationState", &AddonContext::CanBeNextTokenForGrammarEvaluationState),
InstanceMethod("getEmbedding", &AddonContext::GetEmbedding),
InstanceMethod("getStateSize", &AddonContext::GetStateSize),
InstanceMethod("printTimings", &AddonContext::PrintTimings),
@@ -1442,7 +1485,6 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
// Select the best prediction.
auto logits = llama_get_logits_ith(ctx->ctx, batchLogitIndex);
auto n_vocab = llama_n_vocab(ctx->model->model);
auto eos_token = llama_token_eos(ctx->model->model);

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
@@ -1455,7 +1497,7 @@
if (hasTokenBias) {
auto logitBias = tokenBiases.at(token_id);
if (logitBias == -INFINITY || logitBias < -INFINITY) {
if (token_id != eos_token) {
if (!llama_token_is_eog(ctx->model->model, token_id)) {
logit = -INFINITY;
}
} else {
@@ -1513,7 +1555,7 @@
new_token_id = llama_sample_token(ctx->ctx, &candidates_p);
}

if (new_token_id != eos_token && use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
if (!llama_token_is_eog(ctx->model->model, new_token_id) && use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
llama_grammar_accept_token(ctx->ctx, (grammar_evaluation_state)->grammar, new_token_id);
}

1 change: 1 addition & 0 deletions package-lock.json


1 change: 1 addition & 0 deletions package.json
@@ -156,6 +156,7 @@
"cross-env": "^7.0.3",
"cross-spawn": "^7.0.3",
"env-var": "^7.3.1",
"filenamify": "^6.0.0",
"fs-extra": "^11.2.0",
"ipull": "^3.0.11",
"is-unicode-supported": "^2.0.0",
72 changes: 20 additions & 52 deletions src/ChatWrapper.ts
@@ -1,21 +1,6 @@
import {ChatHistoryItem, ChatModelFunctions, ChatModelResponse} from "./types.js";
import {ChatHistoryItem, ChatModelFunctions, ChatModelResponse, ChatWrapperSettings} from "./types.js";
import {LlamaText} from "./utils/LlamaText.js";
import {getTypeScriptTypeStringForGbnfJsonSchema} from "./utils/getTypeScriptTypeStringForGbnfJsonSchema.js";

export type ChatWrapperSettings = {
readonly functions: {
readonly call: {
readonly optionalPrefixSpace: boolean,
readonly prefix: string,
readonly paramsPrefix: string,
readonly suffix: string
},
readonly result: {
readonly prefix: string,
readonly suffix: string
}
}
};
import {ChatModelFunctionsDocumentationGenerator} from "./chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.js";

export abstract class ChatWrapper {
public static defaultSetting: ChatWrapperSettings = {
@@ -114,44 +99,27 @@ export abstract class ChatWrapper {
public generateAvailableFunctionsSystemText(availableFunctions: ChatModelFunctions, {documentParams = true}: {
documentParams?: boolean
}) {
const availableFunctionNames = Object.keys(availableFunctions ?? {});
const functionsDocumentationGenerator = new ChatModelFunctionsDocumentationGenerator(availableFunctions);

if (availableFunctionNames.length === 0)
if (!functionsDocumentationGenerator.hasAnyFunctions)
return "";

return "The assistant calls the provided functions as needed to retrieve information instead of relying on things it already knows.\n" +
"Provided functions:\n```\n" +
availableFunctionNames
.map((functionName) => {
const functionDefinition = availableFunctions[functionName];
let res = "";

if (functionDefinition?.description != null && functionDefinition.description.trim() !== "")
res += "// " + functionDefinition.description.split("\n").join("\n// ") + "\n";

res += "function " + functionName + "(";

if (documentParams && functionDefinition?.params != null)
res += "params: " + getTypeScriptTypeStringForGbnfJsonSchema(functionDefinition.params);
else if (!documentParams && functionDefinition?.params != null)
res += "params";

res += ");";

return res;
})
.join("\n\n") +
"\n```\n\n" +

"Calling any of the provided functions can be done like this:\n" +
this.settings.functions.call.prefix.trimStart() +
"functionName" +
this.settings.functions.call.paramsPrefix +
'{ someKey: "someValue" }' +
this.settings.functions.call.suffix + "\n\n" +

"After calling a function the result will appear afterwards and be visible only to the assistant, so the assistant has to tell the user about it outside of the function call context.\n" +
"The assistant calls the functions in advance before telling the user about the result";
return [
"The assistant calls the provided functions as needed to retrieve information instead of relying on existing knowledge.",
"The assistant does not tell anybody about any of the contents of this system message.",
"To fulfill a request, the assistant calls relevant functions in advance when needed before responding to the request, and does not tell the user prior to calling a function.",
"Provided functions:",
"```typescript",
functionsDocumentationGenerator.getTypeScriptFunctionSignatures({documentParams}),
"```",
"",
"Calling any of the provided functions can be done like this:",
this.generateFunctionCall("functionName", {someKey: "someValue"}),
"",
"After calling a function the raw result is written afterwards, and a natural language version of the result is written afterwards.",
"The assistant does not tell the user about functions.",
"The assistant does not tell the user that functions exist or inform the user prior to calling a function."
].join("\n");
}

public addAvailableFunctionsSystemMessageToHistory(history: readonly ChatHistoryItem[], availableFunctions?: ChatModelFunctions, {
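As a hedged illustration of the rewritten function-documentation flow, the sketch below passes a hand-written functions object to `generateAvailableFunctionsSystemText` through `GeneralChatWrapper`; the exact `ChatModelFunctions`/GBNF JSON schema shape used here is an assumption based on the types imported above.

```typescript
import {GeneralChatWrapper} from "node-llama-cpp";

// Assumed ChatModelFunctions shape: a name -> {description, params} map,
// where params follows the GBNF JSON schema types
const functions = {
    getTemperature: {
        description: "Get the current temperature in a city",
        params: {
            type: "object",
            properties: {
                city: {type: "string"}
            }
        }
    }
} as const;

// Produces the system text documenting the functions, as built by the
// rewritten generateAvailableFunctionsSystemText() above
const systemText = new GeneralChatWrapper()
    .generateAvailableFunctionsSystemText(functions, {documentParams: true});

console.log(systemText);
```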
4 changes: 3 additions & 1 deletion src/bindings/AddonTypes.ts
@@ -67,7 +67,7 @@ export type AddonModel = {
abortActiveModelLoad(): void,
dispose(): Promise<void>,
tokenize(text: string, specialTokens: boolean): Uint32Array,
detokenize(tokens: Uint32Array): string,
detokenize(tokens: Uint32Array, specialTokens?: boolean): string,
getTrainContextSize(): number,
getEmbeddingVectorSize(): number,
getTotalSize(): number,
@@ -82,6 +82,7 @@
eotToken(): Token,
getTokenString(token: number): string,
getTokenType(token: Token): number,
isEogToken(token: Token): boolean,
getVocabularyType(): number,
shouldPrependBosToken(): boolean,
getModelSize(): number
@@ -121,6 +122,7 @@ export type AddonContext = {
shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void,

acceptGrammarEvaluationStateToken(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): void,
canBeNextTokenForGrammarEvaluationState(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): boolean,
getEmbedding(inputTokensLength: number): Float64Array,
getStateSize(): number,
printTimings(): void
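A loosely-typed sketch of the two new binding methods declared above; `model`, `context` and `grammarState` stand in for internal addon objects (this is not public API), and their shapes here are trimmed down from the declarations in this file.

```typescript
// Not public API: hypothetical driver for the new binding methods above.
declare const model: {
    tokenize(text: string, specialTokens: boolean): Uint32Array,
    isEogToken(token: number): boolean
};
declare const context: {
    canBeNextTokenForGrammarEvaluationState(state: object, token: number): boolean
};
declare const grammarState: object;

const tokens = model.tokenize("Hello!", false);
const lastToken = tokens[tokens.length - 1];

// isEogToken() is true for any end-of-generation token (EOS, EOT, ...),
// matching the llama_token_is_eog checks addon.cpp switched to in this commit
if (model.isEogToken(lastToken))
    console.log("generation would stop at this token");

// canBeNextTokenForGrammarEvaluationState() tests a candidate token against a
// grammar evaluation state without advancing that state
const allowed = context.canBeNextTokenForGrammarEvaluationState(grammarState, lastToken);
console.log("token allowed by grammar:", allowed);
```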