
Commit

refactor: align upstream
Signed-off-by: thxCode <thxcode0824@gmail.com>
thxCode committed Nov 29, 2024
1 parent: d1db90d, commit: c8e0f6e
Showing 7 changed files with 68 additions and 50 deletions.
16 changes: 8 additions & 8 deletions llama-box/patches/llama.cpp/cann.patch
@@ -1,5 +1,5 @@
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index bcb54e44..5045a27d 100644
index 04e25b8a..92f2ff60 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -471,7 +471,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {

return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
ctx, size);
@@ -1922,17 +1912,17 @@ struct ggml_backend_cann_device_context {
@@ -1911,17 +1901,17 @@ struct ggml_backend_cann_device_context {
};

static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cann_get_device_memory(ctx->device, free, total);
}

@@ -1959,7 +1949,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -1948,7 +1938,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back

static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
return ggml_backend_cann_init(ctx->device);
}

@@ -1979,7 +1969,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
@@ -1968,7 +1958,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
static bool ggml_backend_cann_supports_buft(
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == dev_ctx->device;
@@ -1988,7 +1978,7 @@ static bool ggml_backend_cann_supports_buft(
@@ -1977,7 +1967,7 @@ static bool ggml_backend_cann_supports_buft(
}

static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cann_buffer_type(ctx->device);
}

@@ -2009,7 +1999,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
@@ -1998,7 +1988,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
*/
static ggml_backend_event_t ggml_backend_cann_device_event_new(
ggml_backend_dev_t dev) {

ggml_cann_set_device(dev_ctx->device);

@@ -2117,11 +2107,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
@@ -2106,11 +2096,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;

for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,
@@ -2150,17 +2136,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
@@ -2139,17 +2125,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
return nullptr;
}

56 changes: 37 additions & 19 deletions llama-box/patches/llama.cpp/clip.patch
@@ -1,5 +1,5 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index aae49c96..e6060d82 100644
index 7ba4cea5..3def24e1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -24,6 +24,10 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

@@ -1162,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -40,17 +44,7 @@
#include <cinttypes>
#include <limits>

-#if defined(LLAVA_LOG_OFF)
-# define LOG_INF(...)
-# define LOG_WRN(...)
-# define LOG_ERR(...)
-# define LOG_DBG(...)
-#else // defined(LLAVA_LOG_OFF)
-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#endif // defined(LLAVA_LOG_OFF)
+#include "common/log.h"

//#define CLIP_DEBUG_FUNCTIONS

@@ -1169,6 +1163,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif

new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index be698854..448954b1 100644
index 4ca53a0b..924ad064 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -2,6 +2,7 @@
#include "llava.h"
@@ -11,17 +11,7 @@
#include <limits>
#include <vector>

#include "llama.h"
-#if defined(LLAVA_LOG_OFF)
-# define LOG_INF(...)
-# define LOG_WRN(...)
-# define LOG_ERR(...)
-# define LOG_DBG(...)
-#else // defined(LLAVA_LOG_OFF)
-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#endif // defined(LLAVA_LOG_OFF)
+#include "common/log.h"

#include <algorithm>
#include <cerrno>
@@ -14,11 +15,6 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-
// RGB uint8 image
struct clip_image_u8 {
int nx;
@@ -362,7 +358,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
@@ -366,7 +356,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

8 changes: 4 additions & 4 deletions llama-box/patches/llama.cpp/embedding.patch
}
#endif
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..7fc47b78 100644
index 22b951ba..72bcce69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19693,10 +19693,10 @@ struct llama_context * llama_new_context_with_model(
@@ -19698,10 +19698,10 @@ struct llama_context * llama_new_context_with_model(
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

// this is necessary due to kv_self.n being padded later during inference

// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
@@ -19706,7 +19706,7 @@ struct llama_context * llama_new_context_with_model(
@@ -19711,7 +19711,7 @@ struct llama_context * llama_new_context_with_model(
cparams.n_batch = GGML_KQ_MASK_PAD;
}


cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
@@ -22313,3 +22313,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
@@ -22318,3 +22318,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
fputs(text, stderr);
fflush(stderr);
}
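
Note on the hunks above: the embedding.patch realignment only shifts line numbers, but the surrounding upstream code it patches pads cparams.n_ctx and cparams.n_batch before the KQ mask is built, as the context comments about kv_self.n and GGML_KQ_MASK_PAD indicate. The following is a minimal, hedged sketch of that round-up arithmetic; GGML_PAD mirrors the macro from ggml.h, while the pad sizes (256 and 32) are example values and are not taken from this diff.

// Hedged sketch of the round-up referred to by the context comments above.
// The pad sizes below are illustrative only.
#include <cstdint>
#include <cstdio>

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    uint32_t n_ctx   = 4097;                                 // requested context size (example)
    uint32_t n_batch = 20;                                   // requested batch size (example)

    uint32_t n_ctx_padded   = GGML_PAD(n_ctx, 256);          // 4097 -> 4352
    uint32_t n_batch_padded = n_batch < 32 ? 32 : n_batch;   // "at least GGML_KQ_MASK_PAD"

    printf("n_ctx %u -> %u, n_batch %u -> %u\n", n_ctx, n_ctx_padded, n_batch, n_batch_padded);
    return 0;
}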
24 changes: 12 additions & 12 deletions llama-box/patches/llama.cpp/model.patch
@@ -1,5 +1,5 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..2d70ebcb 100644
index 22b951ba..012c547e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -158,6 +158,7 @@ enum llm_arch {
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
@@ -5610,6 +5629,17 @@ static void llm_load_hparams(
@@ -5612,6 +5631,17 @@ static void llm_load_hparams(
model.type = e_model::MODEL_335M; break; // bge-large
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -7849,13 +7879,14 @@ static bool llm_load_tensors(
@@ -7854,13 +7884,14 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_BERT:

model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -7870,8 +7901,8 @@ static bool llm_load_tensors(
@@ -7875,8 +7906,8 @@ static bool llm_load_tensors(
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
@@ -7881,6 +7912,7 @@ static bool llm_load_tensors(
@@ -7886,6 +7917,7 @@ static bool llm_load_tensors(
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
} else {
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
}

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -7888,12 +7920,16 @@ static bool llm_load_tensors(
@@ -7893,12 +7925,16 @@ static bool llm_load_tensors(
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -11693,7 +11729,7 @@ struct llm_build_context {
@@ -11698,7 +11734,7 @@ struct llm_build_context {
// token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
}
cb(inpL, "inp_embd", -1);
@@ -11714,7 +11750,7 @@ struct llm_build_context {
@@ -11719,7 +11755,7 @@ struct llm_build_context {
struct ggml_tensor * Vcur;

// self-attention
Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il);

@@ -11744,6 +11780,11 @@ struct llm_build_context {
@@ -11749,6 +11785,11 @@ struct llm_build_context {
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);

Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -11829,6 +11870,13 @@ struct llm_build_context {
@@ -11834,6 +11875,13 @@ struct llm_build_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -16702,6 +16750,7 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16707,6 +16755,7 @@ static struct ggml_cgraph * llama_build_graph(
result = llm.build_refact();
} break;
case LLM_ARCH_BERT:
+ case LLM_ARCH_NEW:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
{
@@ -20078,6 +20127,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20083,6 +20132,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GROK:
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
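
Note on the hunks above: the realigned model.patch wires a BERT-style architecture with a fused QKV projection into the graph builder, and the visible Qcur/Kcur/Vcur context lines slice the wqkv output with ggml_view_2d at offsets 0, n_embd, and n_embd + n_embd_gqa. The snippet below is a hedged, self-contained illustration of that split using plain arrays; the sizes are made up and the real code operates on ggml tensors per token, not std::vector.

// Hedged illustration of the fused-QKV split shown in the Qcur/Kcur/Vcur context lines.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_embd_gqa = 2;                    // example dimensions only
    std::vector<float> qkv(n_embd + 2 * n_embd_gqa);
    for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

    // offsets mirror 0, n_embd, and n_embd + n_embd_gqa in the diff context
    std::vector<float> q(qkv.begin(),                        qkv.begin() + n_embd);
    std::vector<float> k(qkv.begin() + n_embd,               qkv.begin() + n_embd + n_embd_gqa);
    std::vector<float> v(qkv.begin() + n_embd + n_embd_gqa,  qkv.end());

    printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]);   // 0 4 6
    return 0;
}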
4 changes: 2 additions & 2 deletions llama-box/patches/llama.cpp/template.patch
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..193c447a 100644
index 22b951ba..b8c1810c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22118,6 +22118,26 @@ static int32_t llama_chat_apply_template_internal(
@@ -22123,6 +22123,26 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
}
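
Note on the hunk above: the realigned template.patch adds a chat-template branch to llama_chat_apply_template_internal, but the added lines are collapsed in this view; only the trailing add_ass marker "<|start_of_role|>assistant<|end_of_role|>" is visible. The sketch below shows how such a template could assemble a prompt; the per-message layout is an assumption for illustration, not the collapsed patch content.

// Hedged sketch of granite-style prompt assembly. Only the trailing assistant
// marker comes from the diff context; the per-message layout is assumed.
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

int main() {
    std::vector<chat_msg> chat = {{"user", "hello"}};
    bool add_ass = true;                                     // append an open assistant turn

    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|start_of_role|>" << m.role << "<|end_of_role|>\n" << m.content << "\n";
    }
    if (add_ass) {
        ss << "<|start_of_role|>assistant<|end_of_role|>\n";
    }
    printf("%s", ss.str().c_str());
    return 0;
}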
4 changes: 2 additions & 2 deletions llama-box/patches/llama.cpp/token.patch
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..b8f4b271 100644
index 22b951ba..f3b04160 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21754,6 +21754,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
@@ -21759,6 +21759,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
return llama_token_pad_impl(model->vocab);
}

6 changes: 3 additions & 3 deletions llama-box/patches/llama.cpp/tokenizer.patch
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..e1c3fef9 100644
index 22b951ba..a5c30a00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6296,16 +6296,7 @@ static void llm_load_vocab(
@@ -6301,16 +6301,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6407,7 +6398,8 @@ static void llm_load_vocab(
@@ -6412,7 +6403,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {
