From 0984ed8140b5e0978b588ec2e0e0f2b4ddfdeb0d Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Sun, 10 Dec 2023 10:36:29 -0800
Subject: [PATCH] Fix regression with --grammar flag

This code was incorrectly merged in 820d42d. It caused grammar rules
to produce incoherent output. For example, the grammar rule
`--grammar 'root ::= [a-z]+ (" " [a-z]+)+'` would select tokens
without leading spaces. This change fixes that.

Fixes #118
---
 llama.cpp/llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp
index 8cfd5ee787..4fd62c8e78 100644
--- a/llama.cpp/llama.cpp
+++ b/llama.cpp/llama.cpp
@@ -7541,7 +7541,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string & piece = ctx->model.vocab.id_to_token[id].text;
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7753,7 +7753,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string & piece = ctx->model.vocab.id_to_token[token].text;
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7867,7 +7867,7 @@ struct llama_beam_search_data {
     }
 
     // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The repetative patterns below reflect the 2 stages of heaps:
+    // The repetitive patterns below reflect the 2 stages of heaps:
     //  * Gather elements until the vector is full, then call std::make_heap() on it.
     //  * If the heap is full and a new element is found that should be included, pop the
     //    least element to the back(), replace it with the new, then push it into the heap.
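
P.S. For anyone reviewing: the regression came from matching the
grammar against the raw vocab text rather than the detokenized piece.
With a SentencePiece vocabulary, a leading space is stored in the
token text as U+2581 (the "lower one eighth block" character), so the
raw text never contains the literal ' ' that a rule like `" " [a-z]+`
requires. Below is a minimal standalone sketch of that distinction;
the to_piece() helper and the token strings are hypothetical
stand-ins for illustration, not llama.cpp's actual detokenizer, which
also handles byte tokens and other vocab types.

    #include <cassert>
    #include <string>

    // Hypothetical detokenizer: maps SentencePiece's U+2581 space
    // marker ("\xE2\x96\x81" in UTF-8) back to a literal ' '. The
    // real llama_token_to_piece() performs this step, among others.
    static std::string to_piece(const std::string & vocab_text) {
        std::string out;
        for (size_t i = 0; i < vocab_text.size();) {
            if (vocab_text.compare(i, 3, "\xE2\x96\x81") == 0) {
                out += ' ';  // restore the real space character
                i += 3;
            } else {
                out += vocab_text[i++];
            }
        }
        return out;
    }

    int main() {
        // The grammar matcher must see " hello" (with an actual
        // 0x20 space), not the raw vocab text "\u2581hello";
        // otherwise every space-prefixed token gets rejected.
        assert(to_piece("\xE2\x96\x81hello") == " hello");
        return 0;
    }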