diff --git a/llama-box/patches/llama.cpp/cann.patch b/llama-box/patches/llama.cpp/cann.patch
index 5d52756..d1a04ec 100644
--- a/llama-box/patches/llama.cpp/cann.patch
+++ b/llama-box/patches/llama.cpp/cann.patch
@@ -1,5 +1,5 @@
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index bcb54e44..5045a27d 100644
+index 04e25b8a..92f2ff60 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
 @@ -471,7 +471,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
@@ -91,7 +91,7 @@ index bcb54e44..5045a27d 100644
      return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
-@@ -1922,17 +1912,17 @@ struct ggml_backend_cann_device_context {
+@@ -1911,17 +1901,17 @@ struct ggml_backend_cann_device_context {
  };
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -112,7 +112,7 @@ index bcb54e44..5045a27d 100644
      ggml_backend_cann_get_device_memory(ctx->device, free, total);
  }
-@@ -1959,7 +1949,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -1948,7 +1938,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
  static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
      GGML_UNUSED(params);
@@ -121,7 +121,7 @@ index bcb54e44..5045a27d 100644
      return ggml_backend_cann_init(ctx->device);
  }
-@@ -1979,7 +1969,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
+@@ -1968,7 +1958,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
  static bool ggml_backend_cann_supports_buft(
      ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
      if (ggml_backend_buft_is_cann(buft)) {
@@ -130,7 +130,7 @@ index bcb54e44..5045a27d 100644
          ggml_backend_cann_buffer_type_context * buft_ctx =
              (ggml_backend_cann_buffer_type_context *)buft->context;
          return buft_ctx->device == dev_ctx->device;
-@@ -1988,7 +1978,7 @@ static bool ggml_backend_cann_supports_buft(
+@@ -1977,7 +1967,7 @@ static bool ggml_backend_cann_supports_buft(
  }
  static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -139,7 +139,7 @@ index bcb54e44..5045a27d 100644
      return ggml_backend_cann_buffer_type(ctx->device);
  }
-@@ -2009,7 +1999,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
+@@ -1998,7 +1988,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
   */
  static ggml_backend_event_t ggml_backend_cann_device_event_new(
      ggml_backend_dev_t dev) {
@@ -148,7 +148,7 @@ index bcb54e44..5045a27d 100644
      ggml_cann_set_device(dev_ctx->device);
-@@ -2117,11 +2107,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
+@@ -2106,11 +2096,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
          ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
          for (int i = 0; i < ggml_cann_info().device_count; i++) {
@@ -161,7 +161,7 @@ index bcb54e44..5045a27d 100644
          ggml_backend_dev_t dev = new ggml_backend_device {
              /* .iface = */ ggml_backend_cann_device_interface,
              /* .reg = */ &reg,
-@@ -2150,17 +2136,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
+@@ -2139,17 +2125,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
      return nullptr;
  }
diff --git a/llama-box/patches/llama.cpp/clip.patch b/llama-box/patches/llama.cpp/clip.patch
index d76b674..dc2c37f 100644
--- a/llama-box/patches/llama.cpp/clip.patch
+++ b/llama-box/patches/llama.cpp/clip.patch
@@ -1,5 +1,5 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index aae49c96..e6060d82 100644
+index 7ba4cea5..3def24e1 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
 @@ -24,6 +24,10 @@
@@ -13,7 +13,26 @@ index aae49c96..e6060d82 100644
  #define STB_IMAGE_IMPLEMENTATION
  #include "stb_image.h"
-@@ -1162,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -40,17 +44,7 @@
+ #include
+ #include
+
+-#if defined(LLAVA_LOG_OFF)
+-# define LOG_INF(...)
+-# define LOG_WRN(...)
+-# define LOG_ERR(...)
+-# define LOG_DBG(...)
+-#else // defined(LLAVA_LOG_OFF)
+-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+-#endif // defined(LLAVA_LOG_OFF)
++#include "common/log.h"
+
+ //#define CLIP_DEBUG_FUNCTIONS
+
+@@ -1169,6 +1163,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
      LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
  #endif
@@ -26,30 +45,29 @@ index aae49c96..e6060d82 100644
      new_clip->backend = ggml_backend_cpu_init();
      LOG_INF("%s: CLIP using CPU backend\n", __func__);
 diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index be698854..448954b1 100644
+index 4ca53a0b..924ad064 100644
 --- a/examples/llava/llava.cpp
 +++ b/examples/llava/llava.cpp
-@@ -2,6 +2,7 @@
- #include "llava.h"
+@@ -11,17 +11,7 @@
+ #include
+ #include
- #include "llama.h"
+-#if defined(LLAVA_LOG_OFF)
+-# define LOG_INF(...)
+-# define LOG_WRN(...)
+-# define LOG_ERR(...)
+-# define LOG_DBG(...)
+-#else // defined(LLAVA_LOG_OFF)
+-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+-#endif // defined(LLAVA_LOG_OFF)
 +#include "common/log.h"
- #include
- #include
-@@ -14,11 +15,6 @@
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
--#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
--#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
--#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
--#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
--
  // RGB uint8 image
  struct clip_image_u8 {
      int nx;
-@@ -362,7 +358,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
+@@ -366,7 +356,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
      const int64_t t_img_enc_end_us = ggml_time_us();
      float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
diff --git a/llama-box/patches/llama.cpp/embedding.patch b/llama-box/patches/llama.cpp/embedding.patch
index c773860..ce227ac 100644
--- a/llama-box/patches/llama.cpp/embedding.patch
+++ b/llama-box/patches/llama.cpp/embedding.patch
@@ -12,10 +12,10 @@ index ab5e376e..658fd56a 100644
      }
  #endif
 diff --git a/src/llama.cpp b/src/llama.cpp
-index af5e686e..7fc47b78 100644
+index 22b951ba..72bcce69 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -19693,10 +19693,10 @@ struct llama_context * llama_new_context_with_model(
+@@ -19698,10 +19698,10 @@ struct llama_context * llama_new_context_with_model(
      cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
      // this is necessary due to kv_self.n being padded later during inference
@@ -28,7 +28,7 @@ index af5e686e..7fc47b78 100644
      // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
      // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-@@ -19706,7 +19706,7 @@ struct llama_context * llama_new_context_with_model(
+@@ -19711,7 +19711,7 @@ struct llama_context * llama_new_context_with_model(
          cparams.n_batch = GGML_KQ_MASK_PAD;
      }
@@ -37,7 +37,7 @@ index af5e686e..7fc47b78 100644
      cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                                hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
-@@ -22313,3 +22313,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
+@@ -22318,3 +22318,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
      fputs(text, stderr);
      fflush(stderr);
  }
diff --git a/llama-box/patches/llama.cpp/model.patch b/llama-box/patches/llama.cpp/model.patch
index 0fbc324..2f51f6a 100644
--- a/llama-box/patches/llama.cpp/model.patch
+++ b/llama-box/patches/llama.cpp/model.patch
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index af5e686e..2d70ebcb 100644
+index 22b951ba..012c547e 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -158,6 +158,7 @@ enum llm_arch {
@@ -42,7 +42,7 @@ index af5e686e..2d70ebcb 100644
          { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
          { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
          { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
-@@ -5610,6 +5629,17 @@ static void llm_load_hparams(
+@@ -5612,6 +5631,17 @@ static void llm_load_hparams(
                      model.type = e_model::MODEL_335M; break; // bge-large
              }
          } break;
@@ -60,7 +60,7 @@ index af5e686e..2d70ebcb 100644
      case LLM_ARCH_JINA_BERT_V2:
          {
              ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -7849,13 +7879,14 @@ static bool llm_load_tensors(
+@@ -7854,13 +7884,14 @@ static bool llm_load_tensors(
                  }
              } break;
          case LLM_ARCH_BERT:
@@ -77,7 +77,7 @@ index af5e686e..2d70ebcb 100644
              model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
              model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-@@ -7870,8 +7901,8 @@ static bool llm_load_tensors(
+@@ -7875,8 +7906,8 @@ static bool llm_load_tensors(
              for (int i = 0; i < n_layer; ++i) {
                  auto & layer = model.layers[i];
@@ -88,7 +88,7 @@ index af5e686e..2d70ebcb 100644
                  layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-@@ -7881,6 +7912,7 @@ static bool llm_load_tensors(
+@@ -7886,6 +7917,7 @@ static bool llm_load_tensors(
                  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
              } else {
                  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
@@ -96,7 +96,7 @@ index af5e686e..2d70ebcb 100644
              }
              layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-@@ -7888,12 +7920,16 @@ static bool llm_load_tensors(
+@@ -7893,12 +7925,16 @@ static bool llm_load_tensors(
              layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
              layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -116,7 +116,7 @@ index af5e686e..2d70ebcb 100644
              layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
          } else {
              layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-@@ -11693,7 +11729,7 @@ struct llm_build_context {
+@@ -11698,7 +11734,7 @@ struct llm_build_context {
          // token types are hardcoded to zero ("Sentence A")
          struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
          inpL = ggml_add(ctx0, inpL, type_row0);
@@ -125,7 +125,7 @@ index af5e686e..2d70ebcb 100644
              inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
          }
          cb(inpL, "inp_embd", -1);
-@@ -11714,7 +11750,7 @@ struct llm_build_context {
+@@ -11719,7 +11755,7 @@ struct llm_build_context {
              struct ggml_tensor * Vcur;
              // self-attention
@@ -134,7 +134,7 @@ index af5e686e..2d70ebcb 100644
                  Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                  cb(Qcur, "Qcur", il);
-@@ -11744,6 +11780,11 @@ struct llm_build_context {
+@@ -11749,6 +11785,11 @@ struct llm_build_context {
                  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                  cb(cur, "wqkv", il);
@@ -146,7 +146,7 @@ index af5e686e..2d70ebcb 100644
                  Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                  Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-@@ -11829,6 +11870,13 @@ struct llm_build_context {
+@@ -11834,6 +11875,13 @@ struct llm_build_context {
                          model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
                          LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
@@ -160,7 +160,7 @@ index af5e686e..2d70ebcb 100644
              } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                  cur = llm_build_ffn(ctx0, lctx, cur,
                          model.layers[il].ffn_up, NULL, NULL,
-@@ -16702,6 +16750,7 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16707,6 +16755,7 @@ static struct ggml_cgraph * llama_build_graph(
                  result = llm.build_refact();
              } break;
          case LLM_ARCH_BERT:
@@ -168,7 +168,7 @@ index af5e686e..2d70ebcb 100644
          case LLM_ARCH_JINA_BERT_V2:
          case LLM_ARCH_NOMIC_BERT:
              {
-@@ -20078,6 +20127,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20083,6 +20132,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_GROK:
          case LLM_ARCH_DBRX:
          case LLM_ARCH_BERT:
diff --git a/llama-box/patches/llama.cpp/template.patch b/llama-box/patches/llama.cpp/template.patch
index d4e39c6..eb3fc89 100644
--- a/llama-box/patches/llama.cpp/template.patch
+++ b/llama-box/patches/llama.cpp/template.patch
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index af5e686e..193c447a 100644
+index 22b951ba..b8c1810c 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -22118,6 +22118,26 @@ static int32_t llama_chat_apply_template_internal(
+@@ -22123,6 +22123,26 @@ static int32_t llama_chat_apply_template_internal(
          if (add_ass) {
              ss << "<|start_of_role|>assistant<|end_of_role|>\n";
          }
diff --git a/llama-box/patches/llama.cpp/token.patch b/llama-box/patches/llama.cpp/token.patch
index b35ebff..fe8ad3b 100644
--- a/llama-box/patches/llama.cpp/token.patch
+++ b/llama-box/patches/llama.cpp/token.patch
@@ -38,10 +38,10 @@ index 4bb16d2e..528f0291 100644
  llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
  llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 diff --git a/src/llama.cpp b/src/llama.cpp
-index af5e686e..b8f4b271 100644
+index 22b951ba..f3b04160 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -21754,6 +21754,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
+@@ -21759,6 +21759,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
      return llama_token_pad_impl(model->vocab);
  }
diff --git a/llama-box/patches/llama.cpp/tokenizer.patch b/llama-box/patches/llama.cpp/tokenizer.patch
index 1436391..6e1aacd 100644
--- a/llama-box/patches/llama.cpp/tokenizer.patch
+++ b/llama-box/patches/llama.cpp/tokenizer.patch
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index af5e686e..e1c3fef9 100644
+index 22b951ba..a5c30a00 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -6296,16 +6296,7 @@ static void llm_load_vocab(
+@@ -6301,16 +6301,7 @@ static void llm_load_vocab(
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
          vocab.tokenizer_add_space_prefix = false;
          vocab.tokenizer_clean_spaces = true;
@@ -20,7 +20,7 @@ index af5e686e..e1c3fef9 100644
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
      } else if (
              tokenizer_pre == "llama3" ||
-@@ -6407,7 +6398,8 @@ static void llm_load_vocab(
+@@ -6412,7 +6403,8 @@ static void llm_load_vocab(
          vocab.tokenizer_add_bos = true;
          vocab.tokenizer_clean_spaces = false;
      } else {