From 0fd8363adc3f248820f3908d5845c61c2fa36f6f Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 02:04:00 +0300 Subject: [PATCH 01/51] use hipblas based on cublas --- CMakeLists.txt | 26 ++++++++++++++++++++++++++ Makefile | 4 ++++ ggml-cuda.cu | 6 ++++++ ggml.c | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f9fdd30f0830..57cce9bb05d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -168,6 +169,31 @@ if (LLAMA_CUBLAS) endif() endif() +if (LLAMA_HIPBLAS) + cmake_minimum_required(VERSION 3.21) + + find_package(hip) + find_package(hipblas) + + if (hipblas_FOUND) + message(STATUS "hipBLAS found") + + set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) + set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") + + add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") + + add_library(ggml-hip OBJECT ggml-cuda.cu) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip hip::device) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + + else() + message(WARNING "hipBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index f267d086415ee..d2f30e1cc9674 100644 --- a/Makefile +++ b/Makefile @@ -107,6 +107,10 @@ ifdef LLAMA_CUBLAS ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif +ifdef LLAMA_HIPBLAS + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include + LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0baa989a36ca9..90830e5fd2976 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,5 +1,11 @@ #include +#if defined(__HIP_PLATFORM_AMD__) +#include "hip/hip_runtime.h" +#define cudaStream_t hipStream_t +#define __half _Float16 +#else #include +#endif #include "ggml-cuda.h" typedef uint16_t ggml_fp16_t; diff --git a/ggml.c b/ggml.c index da0f5d1d549ab..23befa297136d 100644 --- a/ggml.c +++ b/ggml.c @@ -147,9 +147,41 @@ inline static void* ggml_aligned_malloc(size_t size) { #include #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) + +#if defined(GGML_USE_HIPBLAS) +#include "hipblas/hipblas.h" +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMemcpyAsync 
hipMemcpyAsync +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStream_t hipStream_t +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaSuccess hipSuccess +#define GGML_USE_CUBLAS +#else #include #include +#endif #include "ggml-cuda.h" #define CUDA_CHECK(err) \ @@ -8073,7 +8105,6 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif - #if defined(GGML_USE_CUBLAS) // copy data to device CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); From 54a63c10e85bf454eb1ea99cc27d89cce06144b6 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 22:19:22 +0300 Subject: [PATCH 02/51] Update Makefile for the Cuda kernels --- Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d2f30e1cc9674..8e0ada581e698 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include - LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib + ROCMPATH?= /opt/rocm + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib + HIPCC ?= $(ROCMPATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 0e005f779357c9594b942adaf8d985edb071642a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 02:13:00 +0300 Subject: [PATCH 03/51] Build file changes Now HIP Clang is not required, the CMake scripts will configure the needed compiler, which can be system clang++. Also other code can still use GCC, but CMake will force the clang to link. 
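As a usage sketch (assuming the default ROCm prefix /opt/rocm; adjust the paths for whatever your distribution installs), configuring now looks like:

    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
        cmake .. -DLLAMA_HIPBLAS=ON
    cmake --build .

hipcc itself is no longer required; any sufficiently recent clang/clang++ that can emit AMDGPU code will do.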
--- CMakeLists.txt | 24 ++++++++++-------------- Makefile | 13 ++++++------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cce9bb05d76..cea51078dcdd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,27 +170,23 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) - cmake_minimum_required(VERSION 3.21) - find_package(hip) find_package(hipblas) - if (hipblas_FOUND) + if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS) + enable_language(HIP) + add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) + target_link_libraries(ggml-hip PRIVATE hip::device) - set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) - set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") - - add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") - - add_library(ggml-hip OBJECT ggml-cuda.cu) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip hip::device) - + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) - else() - message(WARNING "hipBLAS not found") + message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() endif() diff --git a/Makefile b/Makefile index 8e0ada581e698..5b856a3677b96 100644 --- a/Makefile +++ b/Makefile @@ -108,13 +108,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCMPATH?= /opt/rocm - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib - HIPCC ?= $(ROCMPATH)/bin/hipcc - OBJS += ggml-cuda.o + ROCM_PATH ?= /opt/rocm + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + HIPCC ?= $(ROCM_PATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include +ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif From d3e1984ce0df5af62ab69c1bdd55a743af4157cc Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 03:32:06 +0300 Subject: [PATCH 04/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b856a3677b96..d27f716ec1fe1 100644 --- a/Makefile +++ b/Makefile @@ -109,7 +109,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib HIPCC ?= $(ROCM_PATH)/bin/hipcc OBJS += ggml-cuda.o ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include From 367723544c2187a2a6cd5954ca37a8faf5335e5a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 22 Apr 2023 23:28:00 +0300 Subject: [PATCH 05/51] More build file changes --- CMakeLists.txt | 15 ++++++++++++--- Makefile | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cea51078dcdd9..2c1958f6acfbc 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,16 +170,24 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + find_package(hip) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS) - enable_language(HIP) add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) - target_link_libraries(ggml-hip PRIVATE hip::device) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip PUBLIC hip::device) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") @@ -188,6 +196,7 @@ if (LLAMA_HIPBLAS) else() message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() + endif() if (LLAMA_ALL_WARNINGS) diff --git a/Makefile b/Makefile index d27f716ec1fe1..5339d5765082c 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) +CCV = $(shell $(CC) --version | head -n 1) +CXXV = $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -108,14 +108,17 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - HIPCC ?= $(ROCM_PATH)/bin/hipcc - OBJS += ggml-cuda.o -ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include -ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch + CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + OBJS += ggml-cuda.o +ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< + $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 3a004b2a0166e412d8d54052c50bfd093611ad95 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Apr 2023 02:24:54 +0300 Subject: [PATCH 06/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e9b9018acd726..4af8aa78ad878 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h From 
608aa33d9f0ee8a7183ed4f9fb62532a65f5b097 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 25 Apr 2023 21:15:04 +0300
Subject: [PATCH 07/51] change default GPU arch to match CMake

---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4af8aa78ad878..bd0139ed4c695 100644
--- a/Makefile
+++ b/Makefile
@@ -117,7 +117,7 @@ ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
CC := $(ROCM_PATH)/llvm/bin/clang
CXX := $(ROCM_PATH)/llvm/bin/clang++
- GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch
+ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030
CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
OBJS += ggml-cuda.o
ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
endif

From ecc056519fd08363922875b23956a13a7b6fbdcf Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 28 Apr 2023 01:58:27 +0300
Subject: [PATCH 08/51] only .cu file needs to be compiled as device

---
CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf087905cbf37..b1fd6e218c661 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,7 +187,7 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_USE_HIPBLAS)
add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
- target_link_libraries(ggml-hip PUBLIC hip::device)
+ target_link_libraries(ggml-hip PRIVATE hip::device)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")

From a1caa486113eb3d1192c6d554feaff7419194313 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 28 Apr 2023 10:08:21 +0300
Subject: [PATCH 09/51] add more cuda defines

This is so 'slaren/cuda-f16f32' would merge.
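To illustrate why plain defines are enough (a minimal sketch; the stream variable below is hypothetical, not code from this series): the shim is pure textual substitution, so CUDA-flavoured code such as

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

is compiled under GGML_USE_HIPBLAS as the equivalent HIP runtime calls

    hipStream_t stream;
    CUDA_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

so a branch that introduces new CUDA/cuBLAS symbols only needs matching defines added here.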
--- ggml-cuda.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml-cuda.h b/ggml-cuda.h index c00d83ba64748..6ab5b3944301d 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -6,19 +6,28 @@ #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaEvent_t hipEvent_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocHost hipMallocHost #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice @@ -26,6 +35,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaSuccess hipSuccess #define GGML_USE_CUBLAS #else From 04c0d480d780b7e43f9cd5726b1c1d66570b57d8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 4 May 2023 12:31:16 +0300 Subject: [PATCH 10/51] Move all HIP stuff to ggml-cuda.cu --- CMakeLists.txt | 10 +++++----- ggml-cuda.cu | 44 +++++++++++++++++++++++++++++++++++++++++--- ggml-cuda.h | 46 ---------------------------------------------- 3 files changed, 46 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e01bb2edd4815..79393a54e4ee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,16 +232,16 @@ if (LLAMA_HIPBLAS) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS) - add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + message(STATUS "HIP and hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip PRIVATE hip::device) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) else() message(WARNING "hipBLAS or HIP not found. 
Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 033c7d5c88ff0..9007f6dcbf626 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5,9 +5,47 @@
#include
#if defined(GGML_USE_HIPBLAS)
-#include "hip/hip_runtime.h"
-#include "hipblas/hipblas.h"
-#include "hip/hip_fp16.h"
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable)
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
#else
#include
#include

diff --git a/ggml-cuda.h b/ggml-cuda.h
index 0e740e30908bc..f7d6a8bc1842a 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,49 +1,3 @@
-#if defined(GGML_USE_HIPBLAS)
-#include "hipblas/hipblas.h"
-#include "hip/hip_runtime.h"
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F HIPBLAS_R_16F
-#define CUDA_R_32F HIPBLAS_R_32F
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaFree hipFree
-#define cudaFreeHost hipFreeHost
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaMalloc hipMalloc
-#define cudaMallocHost hipMallocHost
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent hipStreamWaitEvent
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define GGML_USE_CUBLAS
-#else
-#include
-#include
-#endif
#include "ggml.h"
#ifdef __cplusplus

From baeb482a9429cb7d962da34e9820e62d14ffbe31 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sun, 7 May 2023 12:24:12 +0300
Subject: [PATCH 11/51] Revert to default copy

---
ggml-cuda.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 9007f6dcbf626..7760f0de133a5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -35,7 +35,7 @@
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable)
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost

From 070cbcc1bd7f1b5049feec43507a320d22aac815 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sun, 7 May 2023 18:10:56 +0300
Subject: [PATCH 12/51] occupancy function

---
ggml-cuda.cu | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7760f0de133a5..1b862fe82dff2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -26,10 +26,10 @@
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
+#define cudaEvent_t hipEvent_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetErrorString hipGetErrorString
@@ -40,11 +40,12 @@
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaStream_t hipStream_t
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent hipStreamWaitEvent
-#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include
#include

From 0fe6384755b478bd57c38e626db36f144c617b40 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 12 May 2023 17:22:11 +0300
Subject: [PATCH 13/51] fix makefile

---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 80413517ff8f2..976eefab4f783 100644
--- a/Makefile
+++ b/Makefile
@@ -140,8 +140,8 @@ ifdef LLAMA_HIPBLAS
CC := $(ROCM_PATH)/llvm/bin/clang
CXX := $(ROCM_PATH)/llvm/bin/clang++
GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030
- CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
- CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
OBJS += ggml-cuda.o
ggml-cuda.o: CXXFLAGS += 
$(addprefix --offload-arch=,$(GPU_TARGETS)) From b19fefef943d974db2eda8a8908e67e1d08e317c Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:28:08 +0300 Subject: [PATCH 14/51] Forwardcompat --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7f0975a615c5a..44d0fa0489ccb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -17,6 +17,7 @@ #define CUBLAS_TF32_TENSOR_OP_MATH 0 #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t From 600ace39c8f1d311b8f3c49003f5a6448a44b18e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:42:20 +0300 Subject: [PATCH 15/51] update warp size --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 44d0fa0489ccb..64ddc68ccd6d0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -132,7 +132,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec +#define CUDA_DMMV_BLOCK_SIZE 64 // dmmv = dequantize_mul_mat_vec static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; From a593a4f6c24389528a5eed8e6dc86eb06ced38b8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 00:55:28 +0300 Subject: [PATCH 16/51] Add missing parameters --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18b67b6699c1d..7c0fb0573d2fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,8 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 30d921af3e0b21f511652c98448ccb631434d0d4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:03:56 +0300 Subject: [PATCH 17/51] and makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 6bb362cb20d6f..ea6ee20414b2e 100644 --- a/Makefile +++ b/Makefile @@ -169,6 +169,8 @@ ifdef LLAMA_HIPBLAS LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 4c8b3fb1071dff0cd0c4b4f96e506294ba6473f4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:08:53 +0300 Subject: [PATCH 18/51] add configurable vars --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ea6ee20414b2e..a5dd2a3042b2d 100644 --- a/Makefile +++ b/Makefile @@ -164,13 +164,15 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := 
$(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + LLAMA_CUDA_DMMV_X ?= 64 + LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 9fdaa1d2501a2c4a030af6d34e97b2e4766b27c4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 19:17:53 +0300 Subject: [PATCH 19/51] Add more defs For forward compatibility #1607 --- ggml-cuda.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4b4c678ead265..1253f086189ba 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -25,14 +25,18 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t -#define cudaEvent_t hipEvent_t #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t #define cudaFree hipFree #define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc @@ -43,11 +47,12 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaStream_t hipStream_t +#define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else #include From 5d6eb72164e5ae000d07dd725e635faa7a2f723d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:32:41 +0300 Subject: [PATCH 20/51] warp size fixes --- ggml-cuda.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 8b2fc690e03e4..3a5e1527fb5f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,7 +182,11 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); +#if defined(GGML_USE_HIPBLAS) +#define WARP_SIZE warpSize +#else #define WARP_SIZE 32 +#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -679,8 +683,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 
WARP_SIZE); } if (tid == 0) { From 1ba4ce4ad792f9672eecc37bf982386d3a007914 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:41:08 +0300 Subject: [PATCH 21/51] Revert "warp size fixes" It seems like 32 is faster for me, at least and it won't cause so many conflicts. This reverts commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d. --- ggml-cuda.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3a5e1527fb5f7..8b2fc690e03e4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,11 +182,7 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); -#if defined(GGML_USE_HIPBLAS) -#define WARP_SIZE warpSize -#else #define WARP_SIZE 32 -#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE); + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } if (tid == 0) { From fa5b3d7365266a9903450c1105551ffec7f51d92 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:47:00 +0300 Subject: [PATCH 22/51] fix makefile. --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 94946d6f92522..0b2849712af89 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,6 @@ ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< -endif endif # LLAMA_HIPBLAS ifdef LLAMA_METAL From 61df8e92179b84af9041e53f61d0194dfd791de0 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 14 Jun 2023 22:46:10 +0300 Subject: [PATCH 23/51] add cudaMemset --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fe55cc8cf2743..e54ea6d469863 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -50,6 +50,7 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags From bb16effc750e2706050f5d4ec89cecc42cc13882 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 28 Jun 2023 15:27:10 -0500 Subject: [PATCH 24/51] headers fix; add kquants_iter for hipblas and add gfx803 (#1) * kquants_iter for hipblas and add gfx803 * Update CMakeLists.txt with hipblas kquants_iter and DMMV_F16 * remove dmmv_f16 for now --- CMakeLists.txt | 1 + Makefile | 11 ++++++++--- ggml.c | 6 ++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 296f5043001db..23c28c3589ac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,6 +335,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) 
message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index f49c57edf5ece..49bbfaf4e72fb 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV = $(shell $(CC) --version | head -n 1) -CXXV = $(shell $(CXX) --version | head -n 1) +CCV := $(shell $(CC) --version | head -n 1) +CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -207,13 +207,18 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o +ifdef LLAMA_CUDA_KQUANTS_ITER + CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +else + CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) diff --git a/ggml.c b/ggml.c index 89379516e2bcd..5713a9f43569f 100644 --- a/ggml.c +++ b/ggml.c @@ -230,9 +230,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS) +#endif +#if defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) +#endif +#if defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #endif From c3e3733c61f7705ea00fd593ee94527da8c12f1b Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 2 Jul 2023 15:51:31 +0300 Subject: [PATCH 25/51] ROCm fixes --- ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dca90d9997ea7..8fc37ba1bb9fb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -34,6 +34,7 @@ #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord #define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy #define cudaFree hipFree #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice @@ -56,7 +57,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else From e610466307abc8f8bae641682ab3f91dbc33930e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 17:53:14 +0300 Subject: [PATCH 26/51] Expand arch list and make it overrideable --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 38d65aebc6c01..d3fc7c4c6c785 100644 --- a/Makefile +++ b/Makefile @@ -213,10 +213,10 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - CC := $(ROCM_PATH)/llvm/bin/clang - CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 + 
ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) From afcb8fe0c4f5e918422ea41d08824653d58575ed Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 18:09:27 +0300 Subject: [PATCH 27/51] Add new config option --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34d4a33fe4fd2..54c091413d987 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -367,6 +367,9 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) + endif() set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 2ec4466db54fd2f42f2ab7713cc1061e0cf59bf3 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 13 Jul 2023 13:44:02 +0300 Subject: [PATCH 28/51] Update build flags. GGML_CUDA_DMMV_Y is now GGML_CUDA_MMV_Y so update your build instructions. GGML_CUDA_FORCE_DMMV is always enabled. --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> --- CMakeLists.txt | 6 ++---- Makefile | 14 ++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f9c55a671c93..016d850f4466e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -365,11 +365,9 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - if (LLAMA_CUDA_FORCE_DMMV) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) - endif() + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) diff --git a/Makefile b/Makefile index 88cc288aadb20..039a75365d18e 100644 --- a/Makefile +++ b/Makefile @@ -226,20 +226,18 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 - LLAMA_CUDA_DMMV_X ?= 32 - LLAMA_CUDA_DMMV_Y ?= 1 + LLAMA_CUDA_DMMV_X ?= 32 + LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o -ifdef LLAMA_CUDA_KQUANTS_ITER - CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -else - CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif ggml-cuda.o: CXXFLAGS += $(addprefix 
--offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV +ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS From 1f6294dc4473701b5be791d47e4b3733f95dbc0a Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon, 24 Jul 2023 03:52:01 -0500 Subject: [PATCH 29/51] Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * initialize rocblas --- ggml-cuda.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f6426d4bad168..cac029b480b7a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10,6 +10,7 @@ #include #include #include +#include "rocblas/rocblas.h" #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT @@ -2531,6 +2532,10 @@ void ggml_init_cublas() { static bool initialized = false; if (!initialized) { +#ifdef GGML_USE_HIPBLAS + rocblas_initialize(); + hipDeviceSynchronize(); +#endif CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; From 8e8054ad83e794b261914ad4f337d43e2c76882d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Jul 2023 12:20:49 +0300 Subject: [PATCH 30/51] Add rocblas to build files --- CMakeLists.txt | 3 ++- Makefile | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 016d850f4466e..0488443249560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,6 +359,7 @@ if (LLAMA_HIPBLAS) find_package(hip) find_package(hipblas) + find_package(rocblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") @@ -369,7 +370,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index 4adaaaad87922..5cf0943fac552 100644 --- a/Makefile +++ b/Makefile @@ -228,7 +228,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) From f8e3fc6c746b37d69656fb5ae6af8e411d85dbca Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 29 Jul 2023 14:16:46 +0300 Subject: [PATCH 31/51] rocblas init stuff --- ggml-cuda.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 
91e6c078ecc45..cd122c5be6155 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10,7 +10,10 @@
#include
#include
#include
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
#include "rocblas/rocblas.h"
+#endif
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -2746,10 +2749,14 @@ void ggml_init_cublas() {
static bool initialized = false;
if (!initialized) {
-#ifdef GGML_USE_HIPBLAS
- rocblas_initialize();
- hipDeviceSynchronize();
+
+#ifdef __HIP_PLATFORM_AMD__
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+ rocblas_initialize();
+ CUDA_CHECK(cudaDeviceSynchronize());
#endif
+
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;

From 4336231a32a0c6168da5d79801752289622e9e58 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sat, 29 Jul 2023 18:35:56 +0300
Subject: [PATCH 32/51] add hipBLAS to README

---------

Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
---
README.md | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)

diff --git a/README.md b/README.md
index 6a3268d129b55..05c9d3b5df799 100644
--- a/README.md
+++ b/README.md
@@ -408,6 +408,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+- #### hipBLAS
+
+ This provides BLAS acceleration on HIP-supported GPUs like AMD GPUs.
+ Make sure to have ROCm installed.
+ You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+ Windows support is coming soon...
+
+ - Using `make`:
+ ```bash
+ make LLAMA_HIPBLAS=1
+ ```
+ - Using `CMake`:
+ ```bash
+ mkdir build
+ cd build
+ CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+ cmake --build .
+ ```
+
+ The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+ If your GPU is not officially supported, you can set the environment variable [`HSA_OVERRIDE_GFX_VERSION`] to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+ The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+ | Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. 
Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + - #### CLBlast OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. From c1cb70d64d307d3fd9b7b9f61bb574e36520499a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 31 Jul 2023 19:56:44 +0300 Subject: [PATCH 33/51] new build arg LLAMA_CUDA_MMQ_Y --- CMakeLists.txt | 1 + Makefile | 2 ++ README.md | 7 ++++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969a3de693dea..14eefe0051670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,7 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index 21fed1a46d1a5..cdb12a872f784 100644 --- a/Makefile +++ b/Makefile @@ -270,6 +270,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_MMQ_Y ?= 64 LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) @@ -278,6 +279,7 @@ ifdef LLAMA_HIPBLAS ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h diff --git a/README.md b/README.md index a56b64a88dbd0..947c2b6940d26 100644 --- a/README.md +++ b/README.md @@ -437,9 +437,10 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. 
| + | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom HIP kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From d91456aaf138566fa0aa3d507964049c8a09499b Mon Sep 17 00:00:00 2001 From: ardfork <134447697+ardfork@users.noreply.github.com> Date: Mon, 31 Jul 2023 20:35:00 +0300 Subject: [PATCH 34/51] fix half2 decomposition --- ggml-cuda.cu | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e62891d60f47a..f19c7c7c71ead 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -472,8 +472,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); const int vui = x[ib].qs[iqs]; @@ -515,8 +515,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -568,8 +568,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -579,8 +579,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -666,8 +666,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = 
yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -705,8 +705,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -818,8 +818,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -1039,8 +1039,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1172,8 +1172,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; From 7297128db8159c7b12db4c28a4532b993025c2e5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 7 Aug 2023 08:35:53 +0300 Subject: [PATCH 35/51] [Zig] Rewrite build for Zig 0.11 (#2514) * zig build fixes * Disable LTO on Windows. --- build.zig | 135 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/build.zig b/build.zig index 2287d2a2c442b..04c88d8a2bddc 100644 --- a/build.zig +++ b/build.zig @@ -1,68 +1,87 @@ +// Compatible with Zig Version 0.11.0 const std = @import("std"); -const commit_hash = @embedFile(".git/refs/heads/master"); +const Compile = std.Build.Step.Compile; +const ConfigHeader = std.Build.Step.ConfigHeader; +const Mode = std.builtin.Mode; +const CrossTarget = std.zig.CrossTarget; -// Zig Version: 0.11.0-dev.3986+e05c242cd -pub fn build(b: *std.build.Builder) void { - const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); +const Maker = struct { + builder: *std.build.Builder, + target: CrossTarget, + optimize: Mode, + config_header: *ConfigHeader, + + const cflags = .{"-std=c11"}; + const cxxflags = .{"-std=c++11"}; + + fn init(builder: *std.build.Builder) Maker { + const commit_hash = @embedFile(".git/refs/heads/master"); + const config_header = builder.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. 
commit_hash.len - 1], // omit newline + }, + ); + return Maker{ + .builder = builder, + .target = builder.standardTargetOptions(.{}), + .optimize = builder.standardOptimizeOption(.{}), + .config_header = config_header, + }; + } - const config_header = b.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline - }, - ); + fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { + const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (std.mem.endsWith(u8, src, ".c")) { + o.addCSourceFiles(&.{src}, &cflags); + o.linkLibC(); + } else { + o.addCSourceFiles(&.{src}, &cxxflags); + o.linkLibCpp(); + } + o.addIncludePath(.{ .path = "." }); + o.addIncludePath(.{ .path = "./examples" }); + return o; + } + + fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { + const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); + e.addIncludePath(.{ .path = "." }); + e.addIncludePath(.{ .path = "./examples" }); + e.addCSourceFiles(&.{src}, &cxxflags); + for (deps) |d| e.addObject(d); + e.linkLibC(); + e.linkLibCpp(); + e.addConfigHeader(m.config_header); + m.builder.installArtifact(e); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); - lib.linkLibC(); - lib.linkLibCpp(); - lib.addIncludePath("."); - lib.addIncludePath("./examples"); - lib.addConfigHeader(config_header); - lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); - lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); - b.installArtifact(lib); + // Currently a bug is preventing correct linking for optimized builds for Windows: + // https://github.com/ziglang/zig/issues/15958 + if (e.target.isWindows()) { + e.want_lto = false; + } + return e; + } +}; - const examples = .{ - "main", - "baby-llama", - "embedding", - "metal", - "perplexity", - "quantize", - "quantize-stats", - "save-load-state", - "server", - "simple", - "train-text-from-scratch", - }; +pub fn build(b: *std.build.Builder) void { + const make = Maker.init(b); - inline for (examples) |example_name| { - const exe = b.addExecutable(.{ - .name = example_name, - .target = target, - .optimize = optimize, - }); - exe.addIncludePath("."); - exe.addIncludePath("./examples"); - exe.addConfigHeader(config_header); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - b.installArtifact(exe); + const ggml = make.obj("ggml", "ggml.c"); + const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const llama = make.obj("llama", "llama.cpp"); + const common = make.obj("common", "examples/common.cpp"); + const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); - const run_cmd = b.addRunArtifact(exe); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| run_cmd.addArgs(args); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("train-text-from-scratch", 
"examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama }); - const run_step = b.step("run-" ++ example_name, "Run the app"); - run_step.dependOn(&run_cmd.step); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + if (server.target.isWindows()) { + server.linkSystemLibrary("ws2_32"); } } From 34a14b28ff7f3c98730339bacee035091b2a812a Mon Sep 17 00:00:00 2001 From: GiviMAD Date: Sun, 6 Aug 2023 23:21:46 -0700 Subject: [PATCH 36/51] [Makefile] Move ARM CFLAGS before compilation (#2536) --- Makefile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index e0528aeee24c0..897c5cb9abcca 100644 --- a/Makefile +++ b/Makefile @@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) + CFLAGS += -mcpu=native + CXXFLAGS += -mcpu=native +endif + +ifneq ($(filter armv6%,$(UNAME_M)),) + # Raspberry Pi 1, Zero + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access +endif + +ifneq ($(filter armv7%,$(UNAME_M)),) + # Raspberry Pi 2 + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations +endif + +ifneq ($(filter armv8%,$(UNAME_M)),) + # Raspberry Pi 3, 4, Zero 2 (32-bit) + CFLAGS += -mfp16-format=ieee -mno-unaligned-access +endif + ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) @@ -270,28 +292,6 @@ ifdef LLAMA_METAL OBJS += ggml-metal.o endif # LLAMA_METAL -ifneq ($(filter aarch64%,$(UNAME_M)),) - # Apple M1, M2, etc. - # Raspberry Pi 3, 4, Zero 2 (64-bit) - CFLAGS += -mcpu=native - CXXFLAGS += -mcpu=native -endif - -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, Zero - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 2 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif - -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 3, 4, Zero 2 (32-bit) - CFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ From f6f9896ac3d2ff207e18f87dab85d126ceef5236 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 10:52:57 +0300 Subject: [PATCH 37/51] metal : fix out-of-bounds access + inc concurrency nodes (#2416) * metal : fix out-of-bounds access + style changes * metal : increase concurrency nodes to 2*GGML_MAX_NODES --- ggml-metal.m | 57 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 3f098d39677a0..b47a98e214b61 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -7,6 +7,11 @@ #import #import +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + #ifdef GGML_METAL_NDEBUG #define metal_printf(...) 
#else @@ -15,6 +20,8 @@ #define UNUSED(x) (void)(x) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + struct ggml_metal_buffer { const char * name; @@ -36,7 +43,7 @@ int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - int concur_list[GGML_MAX_NODES]; + int concur_list[GGML_MAX_CONCUR]; int concur_list_len; // custom kernels @@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[GGML_MAX_NODES]; + int nodes_unused[GGML_MAX_CONCUR]; - for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;} - for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;} + for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } + for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } ctx->concur_list_len = 0; - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos + int n_left = gf->n_nodes; + int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list + int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos while (n_left > 0) { // number of nodes at a layer (that can be issued concurrently) @@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency( for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { if (nodes_unused[i]) { // if the requirements for gf->nodes[i] are satisfied - int exe_flag=1; + int exe_flag = 1; + // scan all srcs for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; if (src_cur) { // if is leaf nodes it's satisfied. - if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;} + // TODO: ggml_is_leaf() + if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { + continue; + } // otherwise this src should be the output from previous nodes. int is_found = 0; + // scan 2*search_depth back because we inserted barrier. - for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;} + //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { + for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { + if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { + is_found = 1; + break; + } + } + if (is_found == 0) { + exe_flag = 0; + break; } - if (is_found == 0) {exe_flag = 0; break;} } } if (exe_flag) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. 
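                        // (annotation, not part of the patch) a sketch of the disjointness test applied
                        // below, treating each node's output as the half-open byte range [start, start + len):
                        //   no conflict  <=>  other_start >= start + len  ||  other_start + other_len <= start
                        // any other ordering means the two ranges overlap, so the node cannot be issued early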
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); + int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); for (int j = n_start; j < i; j++) { if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ && gf->nodes[j]->op != GGML_OP_VIEW \ @@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency( if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { continue; - } else { - exe_flag = 0; } + + exe_flag = 0; } } } @@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency( ctx->concur_list[level_pos + concurrency] = -1; ctx->concur_list_len++; // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) {n_start++;} + while (!nodes_unused[n_start]) { + n_start++; + } level_pos += concurrency + 1; } - if (ctx->concur_list_len > GGML_MAX_NODES) { + if (ctx->concur_list_len > GGML_MAX_CONCUR) { fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -453,7 +474,7 @@ void ggml_metal_graph_compute( // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES; + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; From 3d9a55181603e85a26378a850a14068034e5002d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 7 Aug 2023 10:09:40 +0200 Subject: [PATCH 38/51] Fixed mmap prefetch for GPU offloading (#2529) --- llama-util.h | 2 +- llama.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-util.h b/llama-util.h index 3fc03ce28273e..6e9e39ddb6f58 100644 --- a/llama-util.h +++ b/llama-util.h @@ -219,7 +219,7 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } + if (prefetch >= file->size) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { diff --git a/llama.cpp b/llama.cpp index 839739870eb3e..39aefd499dd0c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -747,12 +747,12 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; - size_t prefetch_size = 0; + size_t prefetch_size = file_loader->file.size; size_t lock_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; - if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { - prefetch_size += lt.size; + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + prefetch_size -= lt.size; } } From 99d29c0094476c4962023036ecd61a3309d0e16b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 13:20:09 +0300 Subject: [PATCH 39/51] ggml : sync (custom ops) (#2537) ggml-ci --- ggml.c | 379 +++++++++++++++++++++++++++++++++++++++++++++------------ ggml.h | 145 +++++++++++++++++----- 2 files changed, 413 insertions(+), 111 deletions(-) diff --git a/ggml.c b/ggml.c index fa0f98aa09df2..b4a36524021f9 100644 --- a/ggml.c +++ b/ggml.c @@ -195,8 +195,8 @@ 
typedef void * thread_ret_t; #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; +inline static void * ggml_aligned_malloc(size_t size) { + void * aligned_memory = NULL; #ifdef GGML_USE_METAL int result = posix_memalign(&aligned_memory, getpagesize(), size); #else @@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -6890,7 +6890,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d( ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), a->ne[2], 1, 1, }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6905,10 +6905,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( // ggml_conv_2d -struct ggml_tensor* ggml_conv_2d( - struct ggml_context* ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -6929,7 +6929,7 @@ struct ggml_tensor* ggml_conv_2d( ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), a->ne[3], b->ne[3], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6945,7 +6945,7 @@ struct ggml_tensor* ggml_conv_2d( // ggml_conv_1d_ph -struct ggml_tensor* ggml_conv_1d_ph( +struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -6963,7 +6963,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { // ggml_pool_1d -struct ggml_tensor* ggml_pool_1d( +struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -6982,7 +6982,7 @@ struct ggml_tensor* ggml_pool_1d( ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6996,7 +6996,7 @@ struct ggml_tensor* ggml_pool_1d( // ggml_pool_2d -struct ggml_tensor* ggml_pool_2d( +struct ggml_tensor * ggml_pool_2d( struct 
ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -7019,7 +7019,7 @@ struct ggml_tensor* ggml_pool_2d( ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -7349,7 +7349,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom1 +// ggml_map_custom1_f32 static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, @@ -7366,7 +7366,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -7387,7 +7387,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32( return ggml_map_custom1_impl_f32(ctx, a, fun, true); } -// ggml_map_custom2 +// ggml_map_custom2_f32 static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_context * ctx, @@ -7405,7 +7405,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7429,7 +7429,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32( return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom3 +// ggml_map_custom3_f32 static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_context * ctx, @@ -7448,7 +7448,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7475,6 +7475,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); } +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + + + // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9283,8 +9467,8 @@ static void ggml_compute_forward_sum_rows_f32( for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; @@ -12894,7 +13078,7 @@ static void ggml_compute_forward_pool_1d( const struct ggml_tensor * src0, struct ggml_tensor * dst) { - const int32_t* opts = (const int32_t*)dst->op_params; + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -14227,24 +14411,6 @@ static void ggml_compute_forward_map_custom1_f32( fun(dst, a); } - -static void ggml_compute_forward_map_custom1( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - struct ggml_tensor * dst, - const ggml_custom1_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom1_f32(params, a, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom2 static void ggml_compute_forward_map_custom2_f32( @@ -14263,24 +14429,6 @@ static void ggml_compute_forward_map_custom2_f32( } -static void ggml_compute_forward_map_custom2( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - struct ggml_tensor * dst, - const ggml_custom2_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -14299,24 +14447,52 @@ static void ggml_compute_forward_map_custom3_f32( fun(dst, a, b, c); } +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct 
ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, - struct ggml_tensor * dst, - const ggml_custom3_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); } // ggml_compute_forward_cross_entropy_loss @@ -14838,25 +15014,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM2_F32: { ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM3: + case GGML_OP_MAP_CUSTOM3_F32: { ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_OP_MAP_CUSTOM3: + { + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15664,6 +15855,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: case GGML_OP_MAP_CUSTOM1: case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: @@ -16449,11 +16643,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = 
MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_MAP_CUSTOM3: { - n_tasks = 1; + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } } break; case GGML_OP_CROSS_ENTROPY_LOSS: { diff --git a/ggml.h b/ggml.h index aba92480c833c..bdbd128004332 100644 --- a/ggml.h +++ b/ggml.h @@ -183,6 +183,15 @@ # define GGML_API #endif +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + #include #include #include @@ -374,6 +383,10 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + GGML_OP_MAP_CUSTOM1, GGML_OP_MAP_CUSTOM2, GGML_OP_MAP_CUSTOM3, @@ -570,6 +583,8 @@ extern "C" { GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -1240,7 +1255,7 @@ extern "C" { // conv_1d with padding = half // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor* ggml_conv_1d_ph( + GGML_API struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1253,7 +1268,7 @@ extern "C" { GGML_OP_POOL_COUNT, }; - GGML_API struct ggml_tensor* ggml_pool_1d( + GGML_API struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1261,7 +1276,7 @@ extern "C" { int s0, // stride int p0); // padding - GGML_API struct ggml_tensor* ggml_pool_2d( + GGML_API struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1315,15 +1330,6 @@ extern "C" { int h0, int w); - // custom operators - - typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - GGML_API struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1334,63 +1340,138 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); - GGML_API struct ggml_tensor * ggml_map_unary_f32( + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct 
ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_binary_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct 
ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); // loss function From 9082b5dfbfae01243a0b822dcd2812877e63bf1b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 13:55:18 +0300 Subject: [PATCH 40/51] ggml : change params pointer (style change) (#2539) ggml-ci --- ggml.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/ggml.c b/ggml.c index b4a36524021f9..9c4b49db86be9 100644 --- a/ggml.c +++ b/ggml.c @@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, - /*.op_params =*/ {0}, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings assert(params_size <= GGML_MAX_OP_PARAMS); memcpy(tensor->op_params, params, params_size); } @@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute( result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); return result; } @@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[6] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, n_dims, mode, n_ctx }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi( int32_t op_params[3] = { n_past, n_head }; memcpy(op_params + 2, &bias_max, sizeof(float)); - ggml_set_op_params(result, &op_params, sizeof(op_params)); + ggml_set_op_params(result, op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6893,7 +6894,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6932,7 +6933,7 @@ struct ggml_tensor * ggml_conv_2d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6985,7 +6986,7 @@ struct ggml_tensor * ggml_pool_1d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7022,7 +7023,7 @@ struct ggml_tensor * ggml_pool_2d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; From 60baff7c8584ec369e53469cad5f92e102b1efe4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 14:24:42 +0300 Subject: [PATCH 41/51] ggml : pad result of ggml_nbytes() --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 9c4b49db86be9..c97c2b63f346b 100644 --- a/ggml.c +++ b/ggml.c @@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { // // is enough, but just in case, adding the second part - return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); + return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { From 93356bdb7a324a8f6570f99d02af392cd4c45796 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 14:25:58 +0300 Subject: [PATCH 42/51] ggml : mul mat tweaks (#2372) * ggml : mul mat wip ggml-ci * ggml : alternative thread distribution for mul_mat ggml-ci * ggml : mul_mat block tiling attempt * ggml : mul_mat threads yield ggml-ci --- ggml.c | 134 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 55 deletions(-) diff --git a/ggml.c b/ggml.c index c97c2b63f346b..beb7f464167d5 100644 --- a/ggml.c +++ b/ggml.c @@ -10731,71 +10731,95 @@ static void ggml_compute_forward_mul_mat( return; } - // parallelize by src0 rows - const int64_t dr = (ne01 + nth - 1)/nth; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - const int64_t ir10 = dr*ith; - const int64_t ir11 = MIN(ir10 + dr, ne01); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows - // src1 rows - const int64_t nr1 = ne11*ne12*ne13; + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + // distribute the thread work across the inner or outer loop based on which one is larger - for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { - const int64_t i13 = (ir1/(ne12*ne11)); - const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; - const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - - const int64_t ir0 = (ir1/ne11)%(ne02*ne03); - const int64_t i03 = (ir0/(ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. 
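    // (annotation, not part of the patch) a worked example of the broadcast indexing that
    // replaces this hack, using hypothetical shapes: with ne02 = 8 src0 matrices and
    // ne12 = 32 src1 matrices, the factor r2 = ne12/ne02 = 4 below, so src1 matrix i12
    // reads src0 matrix i02 = i12/r2 -- each src0 matrix is shared by 4 consecutive src1 matrices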
- // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: - // GG: this is likely the correct way to broadcast, though need some more thought - // therefore leaving the comments to remind us for now - const int64_t i02 = (i12 / (ne12 / ne02)); - // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) - // const int64_t i02 = (ir0 - i03*ne02); - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); - - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - - for (int64_t ir = ir10; ir < ir11; ++ir) { - vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); - } + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; } - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; -// ggml_compute_forward_out_prod + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is 
not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } +} + +// ggml_compute_forward_out_prod static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, From f3c3b4b1672d860800639c87d3b5d17564692469 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Mon, 7 Aug 2023 19:07:19 +0200 Subject: [PATCH 43/51] Add --rope-scale parameter (#2544) * common.cpp : Add --rope-scale parameter * README.md : Add info about using linear rope scaling --- examples/common.cpp | 11 +++++++++-- examples/main/README.md | 6 ++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 21f4a0357d422..4d3ba9bb23ab4 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = 1.0f/std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -564,8 +570,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " negative prompt to use for guidance. 
(default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
diff --git a/examples/main/README.md b/examples/main/README.md
index 014112e5db484..55c16096f03b7 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8. This should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.

From 2d7baaf50f3277e65cf71071f61ea34823d14c30 Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Tue, 8 Aug 2023 06:44:48 -0500
Subject: [PATCH 44/51] vim : streaming and more (#2495)

* Update Vim plugin

* Remove getbufoneline usage, Add input bind example.

getbufoneline() appears to be a recently added function and has been replaced with getbufline for compatibility.

An additional example that explains how to add a keybind that works in insert mode was added.
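[Annotation, not part of the patch series] A minimal usage sketch for the `--rope-scale` flag introduced in PATCH 43; the model path and prompt are placeholders, assuming a hypothetical fine-tune whose context was linearly extended 8x from a 4k base:

    ./main -m ./models/your-32k-finetune.bin -c 32768 --rope-scale 8 -p "Your prompt here"

Equivalently, `--rope-freq-scale 0.125` sets the same scaling directly, since the patch stores `--rope-scale N` as `rope_freq_scale = 1/N`.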
--- examples/llama.vim | 132 +++++++++++++++++++++++++++++++++++++++++++++ examples/llm.vim | 23 -------- 2 files changed, 132 insertions(+), 23 deletions(-) create mode 100644 examples/llama.vim delete mode 100644 examples/llm.vim diff --git a/examples/llama.vim b/examples/llama.vim new file mode 100644 index 0000000000000..f03fadfb7a017 --- /dev/null +++ b/examples/llama.vim @@ -0,0 +1,132 @@ +" Requires an already running llama.cpp server +" To install either copy or symlink to ~/.vim/autoload/llama.vim +" Then start with either :call llama#doLlamaGen(), +" or add a keybind to your vimrc such as +" nnoremap Z :call llama#doLlamaGen() +" Similarly, you could add an insert mode keybind with +" inoremap call llama#doLlamaGen() +" +" g:llama_api_url and g:llama_overrides can be configured in your .vimrc +" let g:llama_api_url = "192.168.1.10:8080" +" llama_overrides can also be set through buffer/window scopes. For instance +" autocmd filetype python let b:llama_overrides = {"temp": 0.2} +" Could be added to your .vimrc to automatically set a lower temperature when +" editing a python script +" Additionally, an override dict can be stored at the top of a file +" !*{"stop": ["User:"]} +" Could be added to the start of your chatlog.txt to set the stopping token +" These parameter dicts are merged together from lowest to highest priority: +" server default -> g:llama_overrides -> w:llama_overrides -> +" b:llama_overrides -> in file (!*) overrides +" +" Sublists (like logit_bias and stop) are overridden, not merged +" Example override: +" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} +if !exists("g:llama_api_url") + let g:llama_api_url= "127.0.0.1:8080" +endif +if !exists("g:llama_overrides") + let g:llama_overrides = {} +endif +const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } +const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"] +let s:linedict = {} + +func s:callbackHandler(bufn, channel, msg) + if len(a:msg) < 3 + return + elseif a:msg[0] == "d" + let l:msg = a:msg[6:-1] + else + let l:msg = a:msg + endif + let l:decoded_msg = json_decode(l:msg) + let l:newtext = split(l:decoded_msg['content'], "\n", 1) + if len(l:newtext) > 0 + call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. 
+  else
+    echo "nothing genned"
+  endif
+  if len(newtext) > 1
+    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
+    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
+  endif
+  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
+    echo "Finished generation"
+  endif
+endfunction
+
+func llama#doLlamaGen()
+  if exists("b:job")
+    if job_status(b:job) == "run"
+      call job_stop(b:job)
+      return
+    endif
+  endif
+
+  let l:cbuffer = bufnr("%")
+  let s:linedict[l:cbuffer] = line('$')
+  let l:buflines = getbufline(l:cbuffer, 1, 1000)
+  let l:querydata = copy(s:querydata)
+  call extend(l:querydata, g:llama_overrides)
+  if exists("w:llama_overrides")
+    call extend(l:querydata, w:llama_overrides)
+  endif
+  if exists("b:llama_overrides")
+    call extend(l:querydata, b:llama_overrides)
+  endif
+  if l:buflines[0][0:1] == '!*'
+    let l:userdata = json_decode(l:buflines[0][2:-1])
+    call extend(l:querydata, l:userdata)
+    let l:buflines = l:buflines[1:-1]
+  endif
+  let l:querydata.prompt = join(l:buflines, "\n")
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
+endfunction
+
+" Echoes the tokenization of the provided string, or from the cursor to the end of the word
+" Onus is placed on the user to include the preceding space
+func llama#tokenizeWord(...)
+  if (a:0 > 0)
+    let l:input = a:1
+  else
+    exe "normal \"*ye"
+    let l:input = @*
+  endif
+  let l:querydata = {"content": l:input}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
+endfunction
+
+func s:tokenizeWordCallback(plaintext, channel, msg)
+  echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
+endfunction
+
+
+" Echoes the token count of the entire buffer (or provided string)
+" Example usage: :echo llama#tokenCount()
+func llama#tokenCount(...)
+  if (a:0 > 0)
+    let l:buflines = a:1
+  else
+    let l:buflines = getline(1,1000)
+    if l:buflines[0][0:1] == '!*'
+      let l:buflines = l:buflines[1:-1]
+    endif
+    let l:buflines = join(l:buflines, "\n")
+  endif
+  let l:querydata = {"content": l:buflines}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+endfunction
+
+func s:tokenCountCallback(channel, msg)
+  let resp = json_decode(a:msg)
+  echo len(resp.tokens)
+endfunction
diff --git a/examples/llm.vim b/examples/llm.vim
deleted file mode 100644
index efecad0cd89f1..0000000000000
--- a/examples/llm.vim
+++ /dev/null
@@ -1,23 +0,0 @@
-function! Llm()
-
-    let url = "http://127.0.0.1:8080/completion"
-
-    " Get the content of the current buffer
-    let buffer_content = join(getline(1, '$'), "\n")
-
-    " Create the JSON payload
-    let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
-    let json_payload.prompt = buffer_content
-
-    " Define the curl command
-    let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . 
url - let response = system(curl_command, json_encode(json_payload)) - - " Extract the content field from the response - let content = json_decode(response).content - - " Insert the content at the cursor position - call setline(line('.'), getline('.') . content) -endfunction - -command! Llm call Llm() From e7f94d6fdc83b41ba449b4b8c80821673dd12ffc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Aug 2023 15:05:30 +0300 Subject: [PATCH 45/51] vim : bring back simple llm.vim example --- examples/llm.vim | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 examples/llm.vim diff --git a/examples/llm.vim b/examples/llm.vim new file mode 100644 index 0000000000000..473e0077ad4e9 --- /dev/null +++ b/examples/llm.vim @@ -0,0 +1,25 @@ +" Basic plugin example + +function! Llm() + + let url = "http://127.0.0.1:8080/completion" + + " Get the content of the current buffer + let buffer_content = join(getline(1, '$'), "\n") + + " Create the JSON payload + let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false} + let json_payload.prompt = buffer_content + + " Define the curl command + let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url + let response = system(curl_command, json_encode(json_payload)) + + " Extract the content field from the response + let content = json_decode(response).content + + " Insert the content at the cursor position + call setline(line('.'), getline('.') . content) +endfunction + +command! Llm call Llm() From 7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 Mon Sep 17 00:00:00 2001 From: chaihahaha Date: Tue, 8 Aug 2023 20:07:02 +0800 Subject: [PATCH 46/51] llm.vim : multiline autocompletion, get rid of "^@" (#2543) --- examples/llm.vim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llm.vim b/examples/llm.vim index 473e0077ad4e9..594a285493dcc 100644 --- a/examples/llm.vim +++ b/examples/llm.vim @@ -18,8 +18,10 @@ function! Llm() " Extract the content field from the response let content = json_decode(response).content + let split_newlines = split(content, '\n', 1) + " Insert the content at the cursor position - call setline(line('.'), getline('.') . content) + call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:]) endfunction command! 
Llm call Llm()
From acfc5478ff3446ca3b54553967a3dea09b7c771a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 8 Aug 2023 14:38:16 +0200
Subject: [PATCH 47/51] CUDA: tighter VRAM scratch size for 65b/70b (#2551)

---
 llama.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 39aefd499dd0c..71061aab910ef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
     { MODEL_7B,   512ull * kB },
     { MODEL_13B,  640ull * kB },
     { MODEL_30B,  768ull * kB },
-    { MODEL_65B, 1536ull * kB },
-    { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+    { MODEL_65B, 1280ull * kB },
+    { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
     { MODEL_7B,  128ull },
     { MODEL_13B, 160ull },
     { MODEL_30B, 208ull },
-    { MODEL_65B, 416ull },
-    { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+    { MODEL_65B, 256ull },
+    { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
From f5bfea0580e417f99850d5456ca541d871a3e48c Mon Sep 17 00:00:00 2001
From: Martin Krasser
Date: Tue, 8 Aug 2023 15:29:19 +0200
Subject: [PATCH 48/51] Allow passing grammar to completion endpoint (#2532)

* Allow passing grammar to completion endpoint
---
 Makefile                   |  2 +-
 examples/server/README.md  |  2 ++
 examples/server/server.cpp | 60 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 897c5cb9abcca..32598edfe847d 100644
--- a/Makefile
+++ b/Makefile
@@ -380,7 +380,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
diff --git a/examples/server/README.md b/examples/server/README.md
index aee31ae42e517..e56ca063a9f0e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -151,6 +151,8 @@ node .
     `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
 
+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
     `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
 
     `ignore_eos`: Ignore end of stream token and continue generating (default: false).
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6f7a66da108c8..10ae264f516f4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"
 
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -195,6 +196,8 @@ struct llama_server_context
     llama_context *ctx = nullptr;
     gpt_params params;
 
+    llama_grammar *grammar = nullptr;
+
     bool truncated = false;
     bool stopped_eos = false;
     bool stopped_word = false;
@@ -226,6 +229,7 @@ struct llama_server_context
     void rewind()
     {
         params.antiprompt.clear();
+        params.grammar.clear();
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
@@ -237,6 +241,7 @@ struct llama_server_context
         stopped_limit = false;
         stopping_word = "";
         multibyte_pending = 0;
+        grammar = nullptr;
 
         n_remain = 0;
         n_past = 0;
@@ -257,6 +262,33 @@ struct llama_server_context
         return true;
     }
 
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            grammar_parser::parse_state parsed_grammar;
+
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos());
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
     void loadPrompt()
     {
         params.prompt.insert(0, 1, ' '); // always add a first space
@@ -420,6 +452,10 @@ struct llama_server_context
             logits[llama_token_nl()] = nl_logit;
         }
 
+        if (grammar != nullptr) {
+            llama_sample_grammar(ctx, &candidates_p, grammar);
+        }
+
         if (temp <= 0)
         {
             // Greedy sampling
@@ -457,10 +493,15 @@ struct llama_server_context
             }
         }
 
+        if (grammar != nullptr) {
+            llama_grammar_accept_token(ctx, grammar, result.tok);
+        }
+
         for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
         {
             result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
         }
+
         last_n_tokens.erase(last_n_tokens.begin());
         last_n_tokens.push_back(result.tok);
         num_tokens_predicted++;
@@ -947,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama)
         {"stream", llama.stream},
         {"logit_bias", llama.params.logit_bias},
         {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
     };
 }
@@ -1048,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &llama)
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.prompt = body.value("prompt", default_params.prompt);
+    llama.params.grammar = body.value("grammar", default_params.grammar);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
 
     llama.params.logit_bias.clear();
@@ -1179,6 
+1222,12 @@ int main(int argc, char **argv) parse_options_completion(json::parse(req.body), llama); + if (!llama.loadGrammar()) + { + res.status = 400; + return; + } + llama.loadPrompt(); llama.beginCompletion(); @@ -1334,8 +1383,12 @@ int main(int argc, char **argv) svr.set_error_handler([](const Request &, Response &res) { - res.set_content("File Not Found", "text/plain"); - res.status = 404; }); + if (res.status == 400) { + res.set_content("Invalid request", "text/plain"); + } else { + res.set_content("File Not Found", "text/plain"); + res.status = 404; + } }); // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); @@ -1363,6 +1416,9 @@ int main(int argc, char **argv) return 1; } + if (llama.grammar != nullptr) { + llama_grammar_free(llama.grammar); + } llama_backend_free(); return 0; From 4024f91a665d83b6de8658d45ec9d004c5d90c79 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 9 Aug 2023 01:56:44 +0300 Subject: [PATCH 49/51] Add intrinsics polyfills for AMD --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> --- CMakeLists.txt | 1 - Makefile | 1 - ggml-cuda.cu | 53 ++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0da4db55820df..5d64cf77001ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) diff --git a/Makefile b/Makefile index 450e055fd947f..c3ef75f3fdea0 100644 --- a/Makefile +++ b/Makefile @@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 033df63349704..96e558e4cbbab 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -75,6 +75,29 @@ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#else + const int8x4_t va = reinterpret_cast(a); + 
const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].ds.x = d; - y[ib].ds.y = sum; + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; } template @@ -1609,8 +1632,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = __low2float(dm8) * __low2float(ds8); + const float m8s8 = __high2float(dm8) * __high2float(ds8); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); } static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); @@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); @@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2half(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float dall = bq4_K->d[0]; const float dmin = bq4_K->d[1]; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #pragma unroll for (int i = 0; i < QR5_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2float(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d = bq5_K->d; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float 
vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2*i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; + *dfi_dst = __low2half(*dsi_src); } } From 25d43e0eb578b6e73046d9d6644a3a14d460600d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 9 Aug 2023 09:42:34 +0200 Subject: [PATCH 50/51] CUDA: tuned mul_mat_q kernels (#2546) --- Makefile | 5 - README.md | 1 - ggml-cuda.cu | 1068 ++++++++++++++++++++++++++++++++------------------ 3 files changed, 682 insertions(+), 392 deletions(-) diff --git a/Makefile b/Makefile index 32598edfe847d..f01bf0c8324ed 100644 --- a/Makefile +++ b/Makefile @@ -253,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_MMQ_Y - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -else - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 -endif # LLAMA_CUDA_MMQ_Y #ifdef LLAMA_CUDA_CUBLAS # NVCCFLAGS += -DGGML_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS diff --git a/README.md b/README.md index 2ece294b7c947..6900b1152e736 100644 --- a/README.md +++ b/README.md @@ -406,7 +406,6 @@ Building the program with BLAS support may lead to some performance improvements ---> | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. 
| diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..6390b1158b6a6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -14,6 +14,7 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_TURING 700 #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#ifndef GGML_CUDA_MMQ_Y -#define GGML_CUDA_MMQ_Y 64 -#endif // GGML_CUDA_MMQ_Y - // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = false; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; + static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -1549,8 +1560,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -1884,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -1910,7 +1921,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -1920,39 +1931,30 @@ template static __device__ 
__forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } + if (need_check) { + i = min(i, i_max); + } -// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1960,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } return vec_dot_q4_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1987,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { 
__builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2011,7 +2013,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_1 * bx0 = (block_q4_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2027,7 +2029,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2044,27 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); int u[2*VDR_Q4_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } return vec_dot_q4_1_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -2087,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2111,7 +2105,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bx0 = (block_q5_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if 
(need_check) { @@ -2147,7 +2141,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; if (need_check) { @@ -2164,14 +2158,6 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; @@ -2181,12 +2167,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -2209,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q5_1( +template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2233,7 +2219,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bx0 = (block_q5_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2266,7 +2252,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) { + for (int i0 = 0; i0 < mmq_y; i0 += 
nwarps * QI5_1) { int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2283,14 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2298,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -2323,21 +2301,21 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); } -static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q8_0( +template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2348,7 +2326,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bx0 = (block_q8_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2358,41 +2336,29 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < 
GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; -// #if GGML_CUDA_MMQ_Y < 64 -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } -// #endif // GGML_CUDA_MMQ_Y < 64 + if (need_check) { + i = min(i, i_max); + } -// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -2424,23 +2390,23 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } -static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q2_K( +template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2450,7 +2416,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ const block_q2_K * bx0 = (block_q2_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2466,8 +2432,8 @@ template static __device__ __forceinline__ void load_tiles_q2_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2479,7 +2445,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ } #pragma unroll - for 
(int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2496,14 +2462,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0); - const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; const float * y_df = (const float *) y_ds; @@ -2520,7 +2478,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k; + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } @@ -2551,12 +2509,12 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } -static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2564,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q3_K( +template static __device__ __forceinline__ void load_tiles_q3_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2579,7 +2537,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bx0 = (block_q3_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2596,8 +2554,8 @@ template static __device__ __forceinline__ void load_tiles_q3_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + 
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2609,7 +2567,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); if (need_check) { @@ -2623,7 +2581,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2652,14 +2610,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0); - const int kbx = k / QI3_K; const int ky = (k % QI3_K) * QR3_K; const float * x_dmf = (const float *) x_dm; @@ -2681,7 +2631,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( v[l] = __vsubss4(vll, vlh); } - const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K; + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } @@ -2778,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q4_K( +template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2804,7 +2754,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_K * bx0 = (block_q4_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2820,8 +2770,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { 
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2833,8 +2783,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2858,14 +2808,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0); - int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; #pragma unroll @@ -2876,7 +2818,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k; + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } @@ -2969,23 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q5_K( +template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2995,7 +2937,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_K * bx0 = (block_q5_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3024,8 +2966,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { - int i = (i0 + i_offset * 
QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3037,8 +2979,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3062,18 +3004,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0); - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k; + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } @@ -3103,23 +3037,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } -static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q6_K( +template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -3129,7 +3063,7 @@ template static __device__ __forceinline__ void load_tiles_q6_ const block_q6_K * bx0 = (block_q6_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3159,8 +3093,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { - int i = (i0 + i_offset * 
@@ -3103,23 +3037,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -3129,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     const block_q6_K * bx0 = (block_q6_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -3159,8 +3093,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3172,8 +3106,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3189,25 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0);
-
     const float * x_dmf = (const float *) x_dm;
     const float * y_df  = (const float *) y_ds;

     const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

-    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
-    const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k;
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+    const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
     return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }

-template <int qk, int qr, int qi, bool need_sum, typename block_q_t,
-          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __global__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -3222,14 +3148,11 @@ static __global__ void mul_mat_q(

     const int & ncols_dst = ncols_y;

-    const int tid_x = threadIdx.x;
-    const int tid_y = threadIdx.y;
-
-    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+    const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + tid_x;
+    const int row_dst = row_dst_0 + threadIdx.x;

-    const int col_dst_0 = blockIdx.y*WARP_SIZE;
+    const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;

     int * tile_x_ql = nullptr;
@@ -3239,64 +3162,65 @@ static __global__ void mul_mat_q(

     allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

-    const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
-
-    __shared__ int   tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
-    __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
+    __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

-    float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

+#pragma unroll
         for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + tid_x;
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
             const int kbxd = kqs / QI8_1;

-            for (int i = 0; i < WARP_SIZE; i += 8) {
-                const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                 const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

-                tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
+                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
             }
-        }

-        for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
-            const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
-            const int kby = tid_x % blocks_per_tile_y_col;
-            const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-            // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-            const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
-            half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby];
-            if (need_sum) {
-                *dsi_dst = *dsi_src;
-            } else {
-                float * dfi_dst = (float *) dsi_dst;
-                *dfi_dst = (*dsi_src).x;
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src).x;
+                }
             }
-        }

-        __syncthreads();
+            __syncthreads();

-#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
 #pragma unroll
-#endif // __CUDA_ARCH__ >= 700
-    for (int k = 0; k < WARP_SIZE; k += vdr) {
+                for (int j = 0; j < mmq_x; j += nwarps) {
 #pragma unroll
-        for (int j = 0; j < WARP_SIZE; j += 8) {
-#pragma unroll
-            for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-                sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                                                 tid_x + i, tid_y + j, k);
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
                 }
             }
-    }

-        __syncthreads();
+            __syncthreads();
+        }
     }

@@ -3304,15 +3228,15 @@ static __global__ void mul_mat_q(
         return;
     }

-    for (int j = 0; j < WARP_SIZE; j += 8) {
-        const int col_dst = col_dst_0 + j + tid_y;
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;

         if (col_dst >= ncols_dst) {
             return;
         }

-        for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
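All ten launchers that follow share one dispatch skeleton; only the per-type tile constants and template arguments differ. Each reads the device's compute capability, picks a (mmq_x, mmq_y, nwarps) triple, derives the grid from it, and instantiates the kernel twice per branch so that the need_check == false variant can skip the i = min(i, i_max) clamping whenever nrows_x divides evenly by mmq_y. A standalone sketch of that geometry calculation follows (the capability and matrix sizes are made-up demo inputs, and CC_TURING's numeric value is an assumption; the real constant is defined earlier in ggml-cuda.cu):

    #include <stdio.h>

    int main(void) {
        const int CC_TURING = 700;          /* assumed value of the file's constant */
        const int WARP_SIZE = 32;
        const int compute_capability = 750; /* pretend g_compute_capabilities[id] reported Turing */
        const int nrows_x = 4096, ncols_y = 512;

        const int turing = compute_capability >= CC_TURING;
        const int mmq_x  = 64;                /* q4_0 values from the first launcher below */
        const int mmq_y  = turing ? 128 : 64;
        const int nwarps = 4;

        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; /* one block row per mmq_y output rows */
        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; /* one block col per mmq_x output cols */
        printf("grid %dx%d, block %dx%d, need_check=%d\n",
               block_num_x, block_num_y, WARP_SIZE, nwarps, nrows_x % mmq_y != 0);
        return 0;
    }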
@@ -4014,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4032,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
+    }
 }
@@ -4050,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4068,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4086,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4104,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4122,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4140,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 32;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4158,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
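The Turing-and-newer configurations sit close to the static shared-memory budget, which is part of why the fallback branches use smaller tiles (register pressure is the other part; note that q4_K above and q6_K below also shrink mmq_x to 32 pre-Turing). A rough footprint calculation for the q5_K tiles, assuming the usual ggml constants WARP_SIZE == 32, QI5_K == 32 and QI8_1 == 8, and a 4-byte half2 (illustrative, not part of the patch):

    #include <stdio.h>

    int main(void) {
        const int WARP_SIZE = 32, QI5_K = 32, QI8_1 = 8;
        const int mmq_x = 64, mmq_y = 128; /* the q5_K Turing tuning above */
        /* x tiles from allocate_tiles_q5_K, y tiles from mul_mat_q itself: */
        const long x_ql = 4L * (mmq_y * (2*WARP_SIZE)     + mmq_y);
        const long x_dm = 4L * (mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K);
        const long x_sc = 4L * (mmq_y * (WARP_SIZE/8)     + mmq_y/8);
        const long y_qs = 4L * (mmq_x * WARP_SIZE);
        const long y_ds = 4L * (mmq_x * WARP_SIZE/QI8_1);
        printf("%ld bytes of the 49152-byte static shared-memory limit\n",
               x_ql + x_dm + x_sc + y_qs + y_ds); /* 45136 */
        return 0;
    }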
@@ -4176,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 32;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4361,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     }
 }

-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 void ggml_init_cublas() {
     static bool initialized = false;

@@ -4730,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
     (void) i1;
 }

+static int64_t get_row_rounding(ggml_type type) {
+    int max_compute_capability = INT_MIN;
+    for (int id = 0; id < g_device_count; ++id) {
+        if (max_compute_capability < g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            max_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
     float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
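get_row_rounding above replaces the old fixed GGML_CUDA_MMQ_Y alignment used by the two hunks that follow: since mmq_y now varies per type and per architecture, a multi-GPU row split has to be aligned to the largest tile height any participating device might use. A worked example with hypothetical values, two devices with a 40/60 tensor split over 4096 rows of a Q4_K tensor on Turing hardware (rounding 128, per the switch above):

    #include <stdio.h>

    int main(void) {
        const long nrows = 4096, rounding = 128;
        const float split0 = 0.40f;             /* device 0's share of the rows */
        long row_high = (long)(nrows * split0); /* 1638 ... */
        row_high -= row_high % rounding;        /* ... aligned down to 1536 */
        printf("device 0: rows [0, %ld)\n", row_high);
        printf("device 1: rows [%ld, %ld)\n", row_high, nrows); /* the last device takes the remainder */
        return 0;
    }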
@@ -5130,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     int64_t row_low, row_high;
     if (split) {
+        const int64_t rounding = get_row_rounding(src0->type);
+
         row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;

         if (id == g_device_count - 1) {
             row_high = nrows0;
         } else {
             row_high = nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         row_low = 0;
@@ -5616,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_low = 0;
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+        const int64_t rounding = get_row_rounding(tensor->type);
+
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;

         if (id == g_device_count - 1) {
             row_high = nrows;
         } else {
             row_high = nrows*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         GGML_ASSERT(false);

From 8f8ab6c4c049df501e9a5ed8fef3aa0fc0691421 Mon Sep 17 00:00:00 2001
From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Date: Wed, 9 Aug 2023 18:05:03 -0500
Subject: [PATCH 51/51] hipLDFLAG Path change Unix to multisystem in Makefile

Changed the hardcoded Linux hipBLAS LD path from -L/opt/rocm/lib to the
user-definable ROCM_PATH variable, so the link flags stay correct for ROCm
installs outside /opt/rocm and on non-Linux OSes.
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 469d62de73bc3..0843d86c6b8d6 100644
--- a/Makefile
+++ b/Makefile
@@ -290,7 +290,7 @@ ifdef LLAMA_HIPBLAS
     LLAMA_CUDA_KQUANTS_ITER ?= 2
     CFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
     CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
-    LDFLAGS  += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
+    LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
     OBJS     += ggml-cuda.o
 ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
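With the link path now derived from ROCM_PATH as well, a non-default ROCm install can be targeted entirely from the make invocation, for example (illustrative values; LLAMA_HIPBLAS, ROCM_PATH and GPU_TARGETS are the Makefile variables visible in the hunks above, and the offload target depends on the GPU):

    make LLAMA_HIPBLAS=1 ROCM_PATH=/opt/rocm-5.6.0 GPU_TARGETS=gfx1030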