
Commit

refactor: align upstream
Signed-off-by: thxCode <thxcode0824@gmail.com>
thxCode committed Nov 29, 2024
1 parent: d1db90d, commit: c8e0f6e
Showing 7 changed files with 68 additions and 50 deletions.
16 changes: 8 additions & 8 deletions llama-box/patches/llama.cpp/cann.patch
@@ -1,5 +1,5 @@
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index bcb54e44..5045a27d 100644
index 04e25b8a..92f2ff60 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -471,7 +471,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {

return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
ctx, size);
@@ -1922,17 +1912,17 @@ struct ggml_backend_cann_device_context {
@@ -1911,17 +1901,17 @@ struct ggml_backend_cann_device_context {
};

static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cann_get_device_memory(ctx->device, free, total);
}

@@ -1959,7 +1949,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -1948,7 +1938,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back

static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
return ggml_backend_cann_init(ctx->device);
}

@@ -1979,7 +1969,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
@@ -1968,7 +1958,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
static bool ggml_backend_cann_supports_buft(
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == dev_ctx->device;
@@ -1988,7 +1978,7 @@ static bool ggml_backend_cann_supports_buft(
@@ -1977,7 +1967,7 @@ static bool ggml_backend_cann_supports_buft(
}

static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cann_buffer_type(ctx->device);
}

@@ -2009,7 +1999,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
@@ -1998,7 +1988,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
*/
static ggml_backend_event_t ggml_backend_cann_device_event_new(
ggml_backend_dev_t dev) {

ggml_cann_set_device(dev_ctx->device);

@@ -2117,11 +2107,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
@@ -2106,11 +2096,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;

for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,
@@ -2150,17 +2136,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
@@ -2139,17 +2125,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
return nullptr;
}

56 changes: 37 additions & 19 deletions llama-box/patches/llama.cpp/clip.patch
@@ -1,5 +1,5 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index aae49c96..e6060d82 100644
index 7ba4cea5..3def24e1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -24,6 +24,10 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

@@ -1162,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -40,17 +44,7 @@
#include <cinttypes>
#include <limits>

-#if defined(LLAVA_LOG_OFF)
-# define LOG_INF(...)
-# define LOG_WRN(...)
-# define LOG_ERR(...)
-# define LOG_DBG(...)
-#else // defined(LLAVA_LOG_OFF)
-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#endif // defined(LLAVA_LOG_OFF)
+#include "common/log.h"

//#define CLIP_DEBUG_FUNCTIONS

@@ -1169,6 +1163,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif

new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index be698854..448954b1 100644
index 4ca53a0b..924ad064 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -2,6 +2,7 @@
#include "llava.h"
@@ -11,17 +11,7 @@
#include <limits>
#include <vector>

#include "llama.h"
-#if defined(LLAVA_LOG_OFF)
-# define LOG_INF(...)
-# define LOG_WRN(...)
-# define LOG_ERR(...)
-# define LOG_DBG(...)
-#else // defined(LLAVA_LOG_OFF)
-# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#endif // defined(LLAVA_LOG_OFF)
+#include "common/log.h"

#include <algorithm>
#include <cerrno>
@@ -14,11 +15,6 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-
// RGB uint8 image
struct clip_image_u8 {
int nx;
@@ -362,7 +358,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
@@ -366,7 +356,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

8 changes: 4 additions & 4 deletions llama-box/patches/llama.cpp/embedding.patch
}
#endif
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..7fc47b78 100644
index 22b951ba..72bcce69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19693,10 +19693,10 @@ struct llama_context * llama_new_context_with_model(
@@ -19698,10 +19698,10 @@ struct llama_context * llama_new_context_with_model(
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

// this is necessary due to kv_self.n being padded later during inference

// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
@@ -19706,7 +19706,7 @@ struct llama_context * llama_new_context_with_model(
@@ -19711,7 +19711,7 @@ struct llama_context * llama_new_context_with_model(
cparams.n_batch = GGML_KQ_MASK_PAD;
}


cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
@@ -22313,3 +22313,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
@@ -22318,3 +22318,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
fputs(text, stderr);
fflush(stderr);
}
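
Note on the hunks above: the embedding.patch realignment only shifts line numbers, but the surrounding upstream code it patches pads cparams.n_ctx and cparams.n_batch before the KQ mask is built, as the context comments about kv_self.n and GGML_KQ_MASK_PAD indicate. The following is a minimal, hedged sketch of that round-up arithmetic; GGML_PAD mirrors the macro from ggml.h, while the pad sizes (256 and 32) are example values and are not taken from this diff.

// Hedged sketch of the round-up referred to by the context comments above.
// The pad sizes below are illustrative only.
#include <cstdint>
#include <cstdio>

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    uint32_t n_ctx   = 4097;                                 // requested context size (example)
    uint32_t n_batch = 20;                                   // requested batch size (example)

    uint32_t n_ctx_padded   = GGML_PAD(n_ctx, 256);          // 4097 -> 4352
    uint32_t n_batch_padded = n_batch < 32 ? 32 : n_batch;   // "at least GGML_KQ_MASK_PAD"

    printf("n_ctx %u -> %u, n_batch %u -> %u\n", n_ctx, n_ctx_padded, n_batch, n_batch_padded);
    return 0;
}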
24 changes: 12 additions & 12 deletions llama-box/patches/llama.cpp/model.patch
@@ -1,5 +1,5 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..2d70ebcb 100644
index 22b951ba..012c547e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -158,6 +158,7 @@ enum llm_arch {
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
@@ -5610,6 +5629,17 @@ static void llm_load_hparams(
@@ -5612,6 +5631,17 @@ static void llm_load_hparams(
model.type = e_model::MODEL_335M; break; // bge-large
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -7849,13 +7879,14 @@ static bool llm_load_tensors(
@@ -7854,13 +7884,14 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_BERT:

model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -7870,8 +7901,8 @@ static bool llm_load_tensors(
@@ -7875,8 +7906,8 @@ static bool llm_load_tensors(
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
@@ -7881,6 +7912,7 @@ static bool llm_load_tensors(
@@ -7886,6 +7917,7 @@ static bool llm_load_tensors(
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
} else {
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
}

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -7888,12 +7920,16 @@ static bool llm_load_tensors(
@@ -7893,12 +7925,16 @@ static bool llm_load_tensors(
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -11693,7 +11729,7 @@ struct llm_build_context {
@@ -11698,7 +11734,7 @@ struct llm_build_context {
// token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
}
cb(inpL, "inp_embd", -1);
@@ -11714,7 +11750,7 @@ struct llm_build_context {
@@ -11719,7 +11755,7 @@ struct llm_build_context {
struct ggml_tensor * Vcur;

// self-attention
Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il);

@@ -11744,6 +11780,11 @@ struct llm_build_context {
@@ -11749,6 +11785,11 @@ struct llm_build_context {
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);

Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -11829,6 +11870,13 @@ struct llm_build_context {
@@ -11834,6 +11875,13 @@ struct llm_build_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -16702,6 +16750,7 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16707,6 +16755,7 @@ static struct ggml_cgraph * llama_build_graph(
result = llm.build_refact();
} break;
case LLM_ARCH_BERT:
+ case LLM_ARCH_NEW:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
{
@@ -20078,6 +20127,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20083,6 +20132,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GROK:
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
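
Note on the hunks above: the realigned model.patch wires a BERT-style architecture with a fused QKV projection into the graph builder, and the visible Qcur/Kcur/Vcur context lines slice the wqkv output with ggml_view_2d at offsets 0, n_embd, and n_embd + n_embd_gqa. The snippet below is a hedged, self-contained illustration of that split using plain arrays; the sizes are made up and the real code operates on ggml tensors per token, not std::vector.

// Hedged illustration of the fused-QKV split shown in the Qcur/Kcur/Vcur context lines.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_embd_gqa = 2;                    // example dimensions only
    std::vector<float> qkv(n_embd + 2 * n_embd_gqa);
    for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

    // offsets mirror 0, n_embd, and n_embd + n_embd_gqa in the diff context
    std::vector<float> q(qkv.begin(),                        qkv.begin() + n_embd);
    std::vector<float> k(qkv.begin() + n_embd,               qkv.begin() + n_embd + n_embd_gqa);
    std::vector<float> v(qkv.begin() + n_embd + n_embd_gqa,  qkv.end());

    printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]);   // 0 4 6
    return 0;
}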
4 changes: 2 additions & 2 deletions llama-box/patches/llama.cpp/template.patch
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..193c447a 100644
index 22b951ba..b8c1810c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22118,6 +22118,26 @@ static int32_t llama_chat_apply_template_internal(
@@ -22123,6 +22123,26 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
}
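
Note on the hunk above: the realigned template.patch adds a chat-template branch to llama_chat_apply_template_internal, but the added lines are collapsed in this view; only the trailing add_ass marker "<|start_of_role|>assistant<|end_of_role|>" is visible. The sketch below shows how such a template could assemble a prompt; the per-message layout is an assumption for illustration, not the collapsed patch content.

// Hedged sketch of granite-style prompt assembly. Only the trailing assistant
// marker comes from the diff context; the per-message layout is assumed.
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

int main() {
    std::vector<chat_msg> chat = {{"user", "hello"}};
    bool add_ass = true;                                     // append an open assistant turn

    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|start_of_role|>" << m.role << "<|end_of_role|>\n" << m.content << "\n";
    }
    if (add_ass) {
        ss << "<|start_of_role|>assistant<|end_of_role|>\n";
    }
    printf("%s", ss.str().c_str());
    return 0;
}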
4 changes: 2 additions & 2 deletions llama-box/patches/llama.cpp/token.patch
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..b8f4b271 100644
index 22b951ba..f3b04160 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21754,6 +21754,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
@@ -21759,6 +21759,10 @@ llama_token llama_token_pad(const struct llama_model * model) {
return llama_token_pad_impl(model->vocab);
}

6 changes: 3 additions & 3 deletions llama-box/patches/llama.cpp/tokenizer.patch
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e..e1c3fef9 100644
index 22b951ba..a5c30a00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6296,16 +6296,7 @@ static void llm_load_vocab(
@@ -6301,16 +6301,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6407,7 +6398,8 @@ static void llm_load_vocab(
@@ -6412,7 +6403,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {
