Performance improvements on Arm for legacy and k-quants (#453)
ikawrakow authored May 30, 2024
1 parent 73088c3 commit 293a528
Showing 10 changed files with 1,264 additions and 87 deletions.
15 changes: 14 additions & 1 deletion llama.cpp/ggml-common.h
@@ -203,6 +203,18 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

// [kawrakow] Need these two for performance on Arm
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

//
// Super-block quantization structures
//
@@ -313,10 +325,11 @@ typedef struct {
static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
// [kawrakow] Note: I have switched the order of bsums and qs. This results in some performance gain on Arm
typedef struct {
float d; // delta
int8_t qs[QK_K]; // quants
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
int8_t qs[QK_K]; // quants
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
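Aside (not part of the commit): each bsums entry is the sum of 16 consecutive quants, so a hypothetical scalar helper that fills it from qs could look like the sketch below, assuming QK_K and block_q8_K as defined above.

// Hypothetical sketch, not from the commit: fill bsums from qs for one block_q8_K.
static void fill_bsums_q8_K(block_q8_K * b) {
    for (int g = 0; g < QK_K/16; ++g) {            // one entry per group of 16 quants
        int sum = 0;
        for (int j = 0; j < 16; ++j) sum += b->qs[16*g + j];
        b->bsums[g] = (int16_t) sum;
    }
}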

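To make the new x4 layout concrete, here is a hedged sketch (hypothetical helper, not from the commit) of how four consecutive block_q8_0 blocks map into one block_q8_0_x4: the four deltas are gathered into d[0..3] and each block's QK8_0 (= 32) quants occupy a contiguous 32-byte slice of qs, which is exactly the layout that quantize_row_q8_0 below writes directly.

// Hypothetical sketch: repack four consecutive block_q8_0 into one block_q8_0_x4.
static void pack_q8_0_x4(const block_q8_0 * src, block_q8_0_x4 * dst) {
    for (int ir = 0; ir < 4; ++ir) {
        dst->d[ir] = src[ir].d;                     // four deltas up front
        for (int j = 0; j < QK8_0; ++j) {
            dst->qs[QK8_0*ir + j] = src[ir].qs[j];  // block ir lives at bytes [32*ir, 32*ir + 32)
        }
    }
}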
61 changes: 50 additions & 11 deletions llama.cpp/ggml-quants.inc
@@ -873,7 +873,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@@ -890,16 +894,29 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
}
#elif defined(__wasm_simd128__)
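To restate the index math in the NEON branch above: blocks are processed in groups of four (i4 = i/4 picks the group, ir = i%4 the slot inside it), complete groups are written in the block_q8_1_x4/block_q8_0_x4 layout, and any trailing blocks with i >= nb4 fall back to the plain layout. A hedged scalar reference of the same store pattern for q8_0 (rounding details aside; names are hypothetical, and it assumes the ggml types and GGML_FP32_TO_FP16 plus <math.h>):

#include <math.h>

// Hypothetical scalar sketch of the NEON path above: quantize k floats to q8_0,
// writing complete groups of four blocks in the block_q8_0_x4 layout and any
// trailing blocks (i >= nb4) in the plain block_q8_0 layout.
static void quantize_row_q8_0_scalar_sketch(const float * x, void * vy, int64_t k) {
    const int nb  = k / QK8_0;        // number of 32-element blocks
    const int nb4 = 4*(nb/4);         // blocks covered by complete x4 groups
    block_q8_0    * y  = (block_q8_0    *) vy;
    block_q8_0_x4 * y4 = (block_q8_0_x4 *) vy;
    for (int i = 0; i < nb; i++) {
        const int i4 = i/4, ir = i%4; // group index and slot within the group
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; ++j) amax = fmaxf(amax, fabsf(x[i*QK8_0 + j]));
        const float d  = amax / 127.0f;
        const float id = d ? 1.0f/d : 0.0f;
        if (i < nb4) y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
        else         y[i].d       = GGML_FP32_TO_FP16(d);
        for (int j = 0; j < QK8_0; ++j) {
            const int8_t q = (int8_t) roundf(x[i*QK8_0 + j] * id);
            if (i < nb4) y4[i4].qs[QK8_0*ir + j] = q; // interleaved x4 layout
            else         y[i].qs[j]              = q; // remainder keeps the plain layout
        }
    }
}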
@@ -1192,7 +1209,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
block_q8_1_x4 * restrict y4 = vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@@ -1209,23 +1230,41 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

int32x4_t accv = vdupq_n_s32(0);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}

accv = vaddq_s32(accv, vi);
}

y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
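One detail worth spelling out for q8_1: block_q8_1_x4 carries eight ggml_half values, and the code above uses d[0..3] for the four deltas (y4[i4].d[ir]) and d[4..7] for the four per-block values d * sum(quants) that the plain layout stores in y[i].s (hence y4[i4].d[ir+4]). A minimal hedged accessor sketch (hypothetical name, assuming the ggml FP16 conversion macro):

// Hypothetical sketch: read back the delta and the s value of slot ir (0..3)
// from a packed block_q8_1_x4, following the layout written above.
static void get_q8_1_x4_scales(const block_q8_1_x4 * b, int ir, float * d, float * s) {
    *d = GGML_FP16_TO_FP32(b->d[ir]);     // delta of block ir
    *s = GGML_FP16_TO_FP32(b->d[ir + 4]); // d * sum(quants) of block ir
}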
7 changes: 6 additions & 1 deletion llama.cpp/quantize/quantize.cpp
@@ -65,10 +65,12 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
std::string ftype_str; ftype_str.reserve(ftype_str_in.size());

bool is_number = true;
for (auto ch : ftype_str_in) {
ftype_str.push_back(std::toupper(ch));
if (!std::isdigit(ftype_str.back())) is_number = false;
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
@@ -77,6 +79,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
return true;
}
}
// On my system (OS Ventura 13.2.1) calling std::stoi with invalid input leads to a crash (Segmentation fault 11)
// Hence the check above and the early return
if (!is_number) return false;
try {
int ftype_int = std::stoi(ftype_str);
for (auto & it : QUANT_OPTIONS) {
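The added is_number guard keeps non-numeric strings such as quant names away from std::stoi. A self-contained hedged sketch of the same pattern (hypothetical function, not the project's code):

#include <cctype>
#include <string>

// Hypothetical sketch of the guarded parse: uppercase the input, track whether
// every character is a digit, and only call std::stoi when it is.
static bool parse_int_if_numeric(const std::string & in, int & out) {
    std::string s; s.reserve(in.size());
    bool is_number = !in.empty();
    for (char ch : in) {
        s.push_back((char) std::toupper((unsigned char) ch));
        if (!std::isdigit((unsigned char) s.back())) is_number = false;
    }
    if (!is_number) return false; // e.g. "Q4_K_M" is matched by name instead
    out = std::stoi(s);
    return true;
}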
1 change: 1 addition & 0 deletions llamafile/BUILD.mk
@@ -91,6 +91,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os
o/$(MODE)/llamafile/iqk_mul_mat_amd_avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c
o/$(MODE)/llamafile/iqk_mul_mat_amd_zen4.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512vnni -Xx86_64-mavx512bw -Xx86_64-mavx512dq
o/$(MODE)/llamafile/iqk_mul_mat_arm82.o: private TARGET_ARCH += -Xaarch64-march=armv8.2-a+dotprod+fp16
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
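The new iqk_mul_mat_arm82.o rule targets armv8.2-a with the dotprod and fp16 extensions, which is presumably what lets the Arm kernels use sdot-based int8 dot products. A hedged sketch of the kind of code those flags unlock, guarded by the standard feature macro (assumption: the compiler defines __ARM_FEATURE_DOTPROD when +dotprod is enabled, which is the usual behaviour):

#include <stdint.h>

// Hypothetical sketch: with +dotprod, sixteen int8 x int8 products can be
// accumulated into four int32 lanes by a single vdotq_s32 and then summed.
#if defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>
static int32_t dot16_i8_sketch(const int8_t * a, const int8_t * b) {
    int32x4_t acc = vdupq_n_s32(0);
    acc = vdotq_s32(acc, vld1q_s8(a), vld1q_s8(b)); // 16 products -> 4 partial sums
    return vaddvq_s32(acc);                         // horizontal add of the 4 lanes
}
#endif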
(The remaining changed files in this commit are not shown here.)
