From 0fd8363adc3f248820f3908d5845c61c2fa36f6f Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 02:04:00 +0300 Subject: [PATCH 01/51] use hipblas based on cublas --- CMakeLists.txt | 26 ++++++++++++++++++++++++++ Makefile | 4 ++++ ggml-cuda.cu | 6 ++++++ ggml.c | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f9fdd30f0830..57cce9bb05d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -168,6 +169,31 @@ if (LLAMA_CUBLAS) endif() endif() +if (LLAMA_HIPBLAS) + cmake_minimum_required(VERSION 3.21) + + find_package(hip) + find_package(hipblas) + + if (hipblas_FOUND) + message(STATUS "hipBLAS found") + + set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) + set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") + + add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") + + add_library(ggml-hip OBJECT ggml-cuda.cu) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip hip::device) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + + else() + message(WARNING "hipBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index f267d086415ee..d2f30e1cc9674 100644 --- a/Makefile +++ b/Makefile @@ -107,6 +107,10 @@ ifdef LLAMA_CUBLAS ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif +ifdef LLAMA_HIPBLAS + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include + LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0baa989a36ca9..90830e5fd2976 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,5 +1,11 @@ #include +#if defined(__HIP_PLATFORM_AMD__) +#include "hip/hip_runtime.h" +#define cudaStream_t hipStream_t +#define __half _Float16 +#else #include +#endif #include "ggml-cuda.h" typedef uint16_t ggml_fp16_t; diff --git a/ggml.c b/ggml.c index da0f5d1d549ab..23befa297136d 100644 --- a/ggml.c +++ b/ggml.c @@ -147,9 +147,41 @@ inline static void* ggml_aligned_malloc(size_t size) { #include #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) + +#if defined(GGML_USE_HIPBLAS) +#include "hipblas/hipblas.h" +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMemcpyAsync 
hipMemcpyAsync +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStream_t hipStream_t +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaSuccess hipSuccess +#define GGML_USE_CUBLAS +#else #include #include +#endif #include "ggml-cuda.h" #define CUDA_CHECK(err) \ @@ -8073,7 +8105,6 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif - #if defined(GGML_USE_CUBLAS) // copy data to device CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); From 54a63c10e85bf454eb1ea99cc27d89cce06144b6 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 22:19:22 +0300 Subject: [PATCH 02/51] Update Makefile for the Cuda kernels --- Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d2f30e1cc9674..8e0ada581e698 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include - LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib + ROCMPATH?= /opt/rocm + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib + HIPCC ?= $(ROCMPATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 0e005f779357c9594b942adaf8d985edb071642a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 02:13:00 +0300 Subject: [PATCH 03/51] Build file changes Now HIP Clang is not required, the CMake scripts will configure the needed compiler, which can be system clang++. Also other code can still use GCC, but CMake will force the clang to link. 
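As a usage sketch (assuming the default ROCm prefix /opt/rocm; adjust the paths for whatever your distribution installs), configuring now looks like:

    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
        cmake .. -DLLAMA_HIPBLAS=ON
    cmake --build .

hipcc itself is no longer required; any sufficiently recent clang/clang++ that can emit AMDGPU code will do.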
--- CMakeLists.txt | 24 ++++++++++-------------- Makefile | 13 ++++++------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cce9bb05d76..cea51078dcdd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,27 +170,23 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) - cmake_minimum_required(VERSION 3.21) - find_package(hip) find_package(hipblas) - if (hipblas_FOUND) + if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS) + enable_language(HIP) + add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) + target_link_libraries(ggml-hip PRIVATE hip::device) - set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) - set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") - - add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") - - add_library(ggml-hip OBJECT ggml-cuda.cu) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip hip::device) - + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) - else() - message(WARNING "hipBLAS not found") + message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() endif() diff --git a/Makefile b/Makefile index 8e0ada581e698..5b856a3677b96 100644 --- a/Makefile +++ b/Makefile @@ -108,13 +108,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCMPATH?= /opt/rocm - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib - HIPCC ?= $(ROCMPATH)/bin/hipcc - OBJS += ggml-cuda.o + ROCM_PATH ?= /opt/rocm + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + HIPCC ?= $(ROCM_PATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include +ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif From d3e1984ce0df5af62ab69c1bdd55a743af4157cc Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 03:32:06 +0300 Subject: [PATCH 04/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b856a3677b96..d27f716ec1fe1 100644 --- a/Makefile +++ b/Makefile @@ -109,7 +109,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib HIPCC ?= $(ROCM_PATH)/bin/hipcc OBJS += ggml-cuda.o ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include From 367723544c2187a2a6cd5954ca37a8faf5335e5a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 22 Apr 2023 23:28:00 +0300 Subject: [PATCH 05/51] More build file changes --- CMakeLists.txt | 15 ++++++++++++--- Makefile | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cea51078dcdd9..2c1958f6acfbc 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,16 +170,24 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + find_package(hip) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS) - enable_language(HIP) add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) - target_link_libraries(ggml-hip PRIVATE hip::device) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip PUBLIC hip::device) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") @@ -188,6 +196,7 @@ if (LLAMA_HIPBLAS) else() message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() + endif() if (LLAMA_ALL_WARNINGS) diff --git a/Makefile b/Makefile index d27f716ec1fe1..5339d5765082c 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) +CCV = $(shell $(CC) --version | head -n 1) +CXXV = $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -108,14 +108,17 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - HIPCC ?= $(ROCM_PATH)/bin/hipcc - OBJS += ggml-cuda.o -ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include -ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch + CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + OBJS += ggml-cuda.o +ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< + $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 3a004b2a0166e412d8d54052c50bfd093611ad95 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Apr 2023 02:24:54 +0300 Subject: [PATCH 06/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e9b9018acd726..4af8aa78ad878 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h From 
608aa33d9f0ee8a7183ed4f9fb62532a65f5b097 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 25 Apr 2023 21:15:04 +0300
Subject: [PATCH 07/51] change default GPU arch to match CMake

---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4af8aa78ad878..bd0139ed4c695 100644
--- a/Makefile
+++ b/Makefile
@@ -117,7 +117,7 @@ ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
CC := $(ROCM_PATH)/llvm/bin/clang
CXX := $(ROCM_PATH)/llvm/bin/clang++
- GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch
+ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030
CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
OBJS += ggml-cuda.o
ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
endif

From ecc056519fd08363922875b23956a13a7b6fbdcf Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 28 Apr 2023 01:58:27 +0300
Subject: [PATCH 08/51] only .cu file needs to be compiled as device

---
CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf087905cbf37..b1fd6e218c661 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,7 +187,7 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_USE_HIPBLAS)
add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
- target_link_libraries(ggml-hip PUBLIC hip::device)
+ target_link_libraries(ggml-hip PRIVATE hip::device)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")

From a1caa486113eb3d1192c6d554feaff7419194313 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 28 Apr 2023 10:08:21 +0300
Subject: [PATCH 09/51] add more cuda defines

This is so 'slaren/cuda-f16f32' would merge.
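To illustrate why plain defines are enough (a minimal sketch; the stream variable below is hypothetical, not code from this series): the shim is pure textual substitution, so CUDA-flavoured code such as

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

is compiled under GGML_USE_HIPBLAS as the equivalent HIP runtime calls

    hipStream_t stream;
    CUDA_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

so a branch that introduces new CUDA/cuBLAS symbols only needs matching defines added here.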
--- ggml-cuda.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml-cuda.h b/ggml-cuda.h index c00d83ba64748..6ab5b3944301d 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -6,19 +6,28 @@ #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaEvent_t hipEvent_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocHost hipMallocHost #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice @@ -26,6 +35,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaSuccess hipSuccess #define GGML_USE_CUBLAS #else From 04c0d480d780b7e43f9cd5726b1c1d66570b57d8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 4 May 2023 12:31:16 +0300 Subject: [PATCH 10/51] Move all HIP stuff to ggml-cuda.cu --- CMakeLists.txt | 10 +++++----- ggml-cuda.cu | 44 +++++++++++++++++++++++++++++++++++++++++--- ggml-cuda.h | 46 ---------------------------------------------- 3 files changed, 46 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e01bb2edd4815..79393a54e4ee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,16 +232,16 @@ if (LLAMA_HIPBLAS) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS) - add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + message(STATUS "HIP and hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip PRIVATE hip::device) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) else() message(WARNING "hipBLAS or HIP not found. 
Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 033c7d5c88ff0..9007f6dcbf626 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5,9 +5,47 @@
#include
#if defined(GGML_USE_HIPBLAS)
-#include "hip/hip_runtime.h"
-#include "hipblas/hipblas.h"
-#include "hip/hip_fp16.h"
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable)
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
#else
#include
#include

diff --git a/ggml-cuda.h b/ggml-cuda.h
index 0e740e30908bc..f7d6a8bc1842a 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,49 +1,3 @@
-#if defined(GGML_USE_HIPBLAS)
-#include "hipblas/hipblas.h"
-#include "hip/hip_runtime.h"
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F HIPBLAS_R_16F
-#define CUDA_R_32F HIPBLAS_R_32F
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaFree hipFree
-#define cudaFreeHost hipFreeHost
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaMalloc hipMalloc
-#define cudaMallocHost hipMallocHost
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent hipStreamWaitEvent
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define GGML_USE_CUBLAS
-#else
-#include
-#include
-#endif
#include "ggml.h"
#ifdef __cplusplus

From baeb482a9429cb7d962da34e9820e62d14ffbe31 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sun, 7 May 2023 12:24:12 +0300
Subject: [PATCH 11/51] Revert to default copy

---
ggml-cuda.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 9007f6dcbf626..7760f0de133a5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -35,7 +35,7 @@
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable)
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost

From 070cbcc1bd7f1b5049feec43507a320d22aac815 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sun, 7 May 2023 18:10:56 +0300
Subject: [PATCH 12/51] occupancy function

---
ggml-cuda.cu | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7760f0de133a5..1b862fe82dff2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -26,10 +26,10 @@
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
+#define cudaEvent_t hipEvent_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetErrorString hipGetErrorString
@@ -40,11 +40,12 @@
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaStream_t hipStream_t
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent hipStreamWaitEvent
-#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include
#include

From 0fe6384755b478bd57c38e626db36f144c617b40 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 12 May 2023 17:22:11 +0300
Subject: [PATCH 13/51] fix makefile

---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 80413517ff8f2..976eefab4f783 100644
--- a/Makefile
+++ b/Makefile
@@ -140,8 +140,8 @@ ifdef LLAMA_HIPBLAS
CC := $(ROCM_PATH)/llvm/bin/clang
CXX := $(ROCM_PATH)/llvm/bin/clang++
GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030
- CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
- CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
OBJS += ggml-cuda.o
ggml-cuda.o: CXXFLAGS += 
$(addprefix --offload-arch=,$(GPU_TARGETS)) From b19fefef943d974db2eda8a8908e67e1d08e317c Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:28:08 +0300 Subject: [PATCH 14/51] Forwardcompat --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7f0975a615c5a..44d0fa0489ccb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -17,6 +17,7 @@ #define CUBLAS_TF32_TENSOR_OP_MATH 0 #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t From 600ace39c8f1d311b8f3c49003f5a6448a44b18e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:42:20 +0300 Subject: [PATCH 15/51] update warp size --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 44d0fa0489ccb..64ddc68ccd6d0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -132,7 +132,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec +#define CUDA_DMMV_BLOCK_SIZE 64 // dmmv = dequantize_mul_mat_vec static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; From a593a4f6c24389528a5eed8e6dc86eb06ced38b8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 00:55:28 +0300 Subject: [PATCH 16/51] Add missing parameters --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18b67b6699c1d..7c0fb0573d2fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,8 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 30d921af3e0b21f511652c98448ccb631434d0d4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:03:56 +0300 Subject: [PATCH 17/51] and makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 6bb362cb20d6f..ea6ee20414b2e 100644 --- a/Makefile +++ b/Makefile @@ -169,6 +169,8 @@ ifdef LLAMA_HIPBLAS LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 4c8b3fb1071dff0cd0c4b4f96e506294ba6473f4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:08:53 +0300 Subject: [PATCH 18/51] add configurable vars --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ea6ee20414b2e..a5dd2a3042b2d 100644 --- a/Makefile +++ b/Makefile @@ -164,13 +164,15 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := 
$(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + LLAMA_CUDA_DMMV_X ?= 64 + LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 9fdaa1d2501a2c4a030af6d34e97b2e4766b27c4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 19:17:53 +0300 Subject: [PATCH 19/51] Add more defs For forward compatibility #1607 --- ggml-cuda.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4b4c678ead265..1253f086189ba 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -25,14 +25,18 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t -#define cudaEvent_t hipEvent_t #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t #define cudaFree hipFree #define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc @@ -43,11 +47,12 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaStream_t hipStream_t +#define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else #include From 5d6eb72164e5ae000d07dd725e635faa7a2f723d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:32:41 +0300 Subject: [PATCH 20/51] warp size fixes --- ggml-cuda.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 8b2fc690e03e4..3a5e1527fb5f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,7 +182,11 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); +#if defined(GGML_USE_HIPBLAS) +#define WARP_SIZE warpSize +#else #define WARP_SIZE 32 +#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -679,8 +683,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 
WARP_SIZE); } if (tid == 0) { From 1ba4ce4ad792f9672eecc37bf982386d3a007914 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:41:08 +0300 Subject: [PATCH 21/51] Revert "warp size fixes" It seems like 32 is faster for me, at least and it won't cause so many conflicts. This reverts commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d. --- ggml-cuda.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3a5e1527fb5f7..8b2fc690e03e4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,11 +182,7 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); -#if defined(GGML_USE_HIPBLAS) -#define WARP_SIZE warpSize -#else #define WARP_SIZE 32 -#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE); + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } if (tid == 0) { From fa5b3d7365266a9903450c1105551ffec7f51d92 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:47:00 +0300 Subject: [PATCH 22/51] fix makefile. --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 94946d6f92522..0b2849712af89 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,6 @@ ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< -endif endif # LLAMA_HIPBLAS ifdef LLAMA_METAL From 61df8e92179b84af9041e53f61d0194dfd791de0 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 14 Jun 2023 22:46:10 +0300 Subject: [PATCH 23/51] add cudaMemset --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fe55cc8cf2743..e54ea6d469863 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -50,6 +50,7 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags From bb16effc750e2706050f5d4ec89cecc42cc13882 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 28 Jun 2023 15:27:10 -0500 Subject: [PATCH 24/51] headers fix; add kquants_iter for hipblas and add gfx803 (#1) * kquants_iter for hipblas and add gfx803 * Update CMakeLists.txt with hipblas kquants_iter and DMMV_F16 * remove dmmv_f16 for now --- CMakeLists.txt | 1 + Makefile | 11 ++++++++--- ggml.c | 6 ++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 296f5043001db..23c28c3589ac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,6 +335,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) 
message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index f49c57edf5ece..49bbfaf4e72fb 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV = $(shell $(CC) --version | head -n 1) -CXXV = $(shell $(CXX) --version | head -n 1) +CCV := $(shell $(CC) --version | head -n 1) +CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -207,13 +207,18 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o +ifdef LLAMA_CUDA_KQUANTS_ITER + CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +else + CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) diff --git a/ggml.c b/ggml.c index 89379516e2bcd..5713a9f43569f 100644 --- a/ggml.c +++ b/ggml.c @@ -230,9 +230,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS) +#endif +#if defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) +#endif +#if defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #endif From c3e3733c61f7705ea00fd593ee94527da8c12f1b Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 2 Jul 2023 15:51:31 +0300 Subject: [PATCH 25/51] ROCm fixes --- ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dca90d9997ea7..8fc37ba1bb9fb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -34,6 +34,7 @@ #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord #define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy #define cudaFree hipFree #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice @@ -56,7 +57,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else From e610466307abc8f8bae641682ab3f91dbc33930e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 17:53:14 +0300 Subject: [PATCH 26/51] Expand arch list and make it overrideable --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 38d65aebc6c01..d3fc7c4c6c785 100644 --- a/Makefile +++ b/Makefile @@ -213,10 +213,10 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - CC := $(ROCM_PATH)/llvm/bin/clang - CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 + 
ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) From afcb8fe0c4f5e918422ea41d08824653d58575ed Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 18:09:27 +0300 Subject: [PATCH 27/51] Add new config option --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34d4a33fe4fd2..54c091413d987 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -367,6 +367,9 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) + endif() set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 2ec4466db54fd2f42f2ab7713cc1061e0cf59bf3 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 13 Jul 2023 13:44:02 +0300 Subject: [PATCH 28/51] Update build flags. GGML_CUDA_DMMV_Y is now GGML_CUDA_MMV_Y so update your build instructions. GGML_CUDA_FORCE_DMMV is always enabled. --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> --- CMakeLists.txt | 6 ++---- Makefile | 14 ++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f9c55a671c93..016d850f4466e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -365,11 +365,9 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - if (LLAMA_CUDA_FORCE_DMMV) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) - endif() + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) diff --git a/Makefile b/Makefile index 88cc288aadb20..039a75365d18e 100644 --- a/Makefile +++ b/Makefile @@ -226,20 +226,18 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 - LLAMA_CUDA_DMMV_X ?= 32 - LLAMA_CUDA_DMMV_Y ?= 1 + LLAMA_CUDA_DMMV_X ?= 32 + LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o -ifdef LLAMA_CUDA_KQUANTS_ITER - CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -else - CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif ggml-cuda.o: CXXFLAGS += $(addprefix 
--offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV +ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS From 1f6294dc4473701b5be791d47e4b3733f95dbc0a Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon, 24 Jul 2023 03:52:01 -0500 Subject: [PATCH 29/51] Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * initialize rocblas --- ggml-cuda.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f6426d4bad168..cac029b480b7a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10,6 +10,7 @@ #include #include #include +#include "rocblas/rocblas.h" #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT @@ -2531,6 +2532,10 @@ void ggml_init_cublas() { static bool initialized = false; if (!initialized) { +#ifdef GGML_USE_HIPBLAS + rocblas_initialize(); + hipDeviceSynchronize(); +#endif CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; From 8e8054ad83e794b261914ad4f337d43e2c76882d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Jul 2023 12:20:49 +0300 Subject: [PATCH 30/51] Add rocblas to build files --- CMakeLists.txt | 3 ++- Makefile | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 016d850f4466e..0488443249560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,6 +359,7 @@ if (LLAMA_HIPBLAS) find_package(hip) find_package(hipblas) + find_package(rocblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") @@ -369,7 +370,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index 4adaaaad87922..5cf0943fac552 100644 --- a/Makefile +++ b/Makefile @@ -228,7 +228,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) From f8e3fc6c746b37d69656fb5ae6af8e411d85dbca Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 29 Jul 2023 14:16:46 +0300 Subject: [PATCH 31/51] rocblas init stuff --- ggml-cuda.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 
91e6c078ecc45..cd122c5be6155 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10,7 +10,10 @@
#include
#include
#include
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
#include "rocblas/rocblas.h"
+#endif
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -2746,10 +2749,14 @@ void ggml_init_cublas() {
static bool initialized = false;
if (!initialized) {
-#ifdef GGML_USE_HIPBLAS
- rocblas_initialize();
- hipDeviceSynchronize();
+
+#ifdef __HIP_PLATFORM_AMD__
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+ rocblas_initialize();
+ CUDA_CHECK(cudaDeviceSynchronize());
#endif
+
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;

From 4336231a32a0c6168da5d79801752289622e9e58 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sat, 29 Jul 2023 18:35:56 +0300
Subject: [PATCH 32/51] add hipBLAS to README

---------

Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
---
README.md | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)

diff --git a/README.md b/README.md
index 6a3268d129b55..05c9d3b5df799 100644
--- a/README.md
+++ b/README.md
@@ -408,6 +408,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+- #### hipBLAS
+
+ This provides BLAS acceleration on HIP-supported GPUs like AMD GPUs.
+ Make sure to have ROCm installed.
+ You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+ Windows support is coming soon...
+
+ - Using `make`:
+ ```bash
+ make LLAMA_HIPBLAS=1
+ ```
+ - Using `CMake`:
+ ```bash
+ mkdir build
+ cd build
+ CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+ cmake --build .
+ ```
+
+ The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+ If your GPU is not officially supported, you can set the environment variable [`HSA_OVERRIDE_GFX_VERSION`] to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+ The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+ | Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. 
Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + - #### CLBlast OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. From c1cb70d64d307d3fd9b7b9f61bb574e36520499a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 31 Jul 2023 19:56:44 +0300 Subject: [PATCH 33/51] new build arg LLAMA_CUDA_MMQ_Y --- CMakeLists.txt | 1 + Makefile | 2 ++ README.md | 7 ++++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969a3de693dea..14eefe0051670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,7 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index 21fed1a46d1a5..cdb12a872f784 100644 --- a/Makefile +++ b/Makefile @@ -270,6 +270,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_MMQ_Y ?= 64 LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) @@ -278,6 +279,7 @@ ifdef LLAMA_HIPBLAS ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h diff --git a/README.md b/README.md index a56b64a88dbd0..947c2b6940d26 100644 --- a/README.md +++ b/README.md @@ -437,9 +437,10 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. 
| + | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom HIP kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From d91456aaf138566fa0aa3d507964049c8a09499b Mon Sep 17 00:00:00 2001 From: ardfork <134447697+ardfork@users.noreply.github.com> Date: Mon, 31 Jul 2023 20:35:00 +0300 Subject: [PATCH 34/51] fix half2 decomposition --- ggml-cuda.cu | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e62891d60f47a..f19c7c7c71ead 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -472,8 +472,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); const int vui = x[ib].qs[iqs]; @@ -515,8 +515,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -568,8 +568,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -579,8 +579,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -666,8 +666,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = 
yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -705,8 +705,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -818,8 +818,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -1039,8 +1039,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1172,8 +1172,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; From 7297128db8159c7b12db4c28a4532b993025c2e5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 7 Aug 2023 08:35:53 +0300 Subject: [PATCH 35/51] [Zig] Rewrite build for Zig 0.11 (#2514) * zig build fixes * Disable LTO on Windows. --- build.zig | 135 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/build.zig b/build.zig index 2287d2a2c442b..04c88d8a2bddc 100644 --- a/build.zig +++ b/build.zig @@ -1,68 +1,87 @@ +// Compatible with Zig Version 0.11.0 const std = @import("std"); -const commit_hash = @embedFile(".git/refs/heads/master"); +const Compile = std.Build.Step.Compile; +const ConfigHeader = std.Build.Step.ConfigHeader; +const Mode = std.builtin.Mode; +const CrossTarget = std.zig.CrossTarget; -// Zig Version: 0.11.0-dev.3986+e05c242cd -pub fn build(b: *std.build.Builder) void { - const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); +const Maker = struct { + builder: *std.build.Builder, + target: CrossTarget, + optimize: Mode, + config_header: *ConfigHeader, + + const cflags = .{"-std=c11"}; + const cxxflags = .{"-std=c++11"}; + + fn init(builder: *std.build.Builder) Maker { + const commit_hash = @embedFile(".git/refs/heads/master"); + const config_header = builder.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. 
commit_hash.len - 1], // omit newline + }, + ); + return Maker{ + .builder = builder, + .target = builder.standardTargetOptions(.{}), + .optimize = builder.standardOptimizeOption(.{}), + .config_header = config_header, + }; + } - const config_header = b.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline - }, - ); + fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { + const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (std.mem.endsWith(u8, src, ".c")) { + o.addCSourceFiles(&.{src}, &cflags); + o.linkLibC(); + } else { + o.addCSourceFiles(&.{src}, &cxxflags); + o.linkLibCpp(); + } + o.addIncludePath(.{ .path = "." }); + o.addIncludePath(.{ .path = "./examples" }); + return o; + } + + fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { + const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); + e.addIncludePath(.{ .path = "." }); + e.addIncludePath(.{ .path = "./examples" }); + e.addCSourceFiles(&.{src}, &cxxflags); + for (deps) |d| e.addObject(d); + e.linkLibC(); + e.linkLibCpp(); + e.addConfigHeader(m.config_header); + m.builder.installArtifact(e); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); - lib.linkLibC(); - lib.linkLibCpp(); - lib.addIncludePath("."); - lib.addIncludePath("./examples"); - lib.addConfigHeader(config_header); - lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); - lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); - b.installArtifact(lib); + // Currently a bug is preventing correct linking for optimized builds for Windows: + // https://github.com/ziglang/zig/issues/15958 + if (e.target.isWindows()) { + e.want_lto = false; + } + return e; + } +}; - const examples = .{ - "main", - "baby-llama", - "embedding", - "metal", - "perplexity", - "quantize", - "quantize-stats", - "save-load-state", - "server", - "simple", - "train-text-from-scratch", - }; +pub fn build(b: *std.build.Builder) void { + const make = Maker.init(b); - inline for (examples) |example_name| { - const exe = b.addExecutable(.{ - .name = example_name, - .target = target, - .optimize = optimize, - }); - exe.addIncludePath("."); - exe.addIncludePath("./examples"); - exe.addConfigHeader(config_header); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - b.installArtifact(exe); + const ggml = make.obj("ggml", "ggml.c"); + const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const llama = make.obj("llama", "llama.cpp"); + const common = make.obj("common", "examples/common.cpp"); + const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); - const run_cmd = b.addRunArtifact(exe); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| run_cmd.addArgs(args); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("train-text-from-scratch", 
"examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama }); - const run_step = b.step("run-" ++ example_name, "Run the app"); - run_step.dependOn(&run_cmd.step); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + if (server.target.isWindows()) { + server.linkSystemLibrary("ws2_32"); } } From 34a14b28ff7f3c98730339bacee035091b2a812a Mon Sep 17 00:00:00 2001 From: GiviMAD Date: Sun, 6 Aug 2023 23:21:46 -0700 Subject: [PATCH 36/51] [Makefile] Move ARM CFLAGS before compilation (#2536) --- Makefile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index e0528aeee24c0..897c5cb9abcca 100644 --- a/Makefile +++ b/Makefile @@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) + CFLAGS += -mcpu=native + CXXFLAGS += -mcpu=native +endif + +ifneq ($(filter armv6%,$(UNAME_M)),) + # Raspberry Pi 1, Zero + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access +endif + +ifneq ($(filter armv7%,$(UNAME_M)),) + # Raspberry Pi 2 + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations +endif + +ifneq ($(filter armv8%,$(UNAME_M)),) + # Raspberry Pi 3, 4, Zero 2 (32-bit) + CFLAGS += -mfp16-format=ieee -mno-unaligned-access +endif + ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) @@ -270,28 +292,6 @@ ifdef LLAMA_METAL OBJS += ggml-metal.o endif # LLAMA_METAL -ifneq ($(filter aarch64%,$(UNAME_M)),) - # Apple M1, M2, etc. - # Raspberry Pi 3, 4, Zero 2 (64-bit) - CFLAGS += -mcpu=native - CXXFLAGS += -mcpu=native -endif - -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, Zero - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 2 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif - -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 3, 4, Zero 2 (32-bit) - CFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ From f6f9896ac3d2ff207e18f87dab85d126ceef5236 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 10:52:57 +0300 Subject: [PATCH 37/51] metal : fix out-of-bounds access + inc concurrency nodes (#2416) * metal : fix out-of-bounds access + style changes * metal : increase concurrency nodes to 2*GGML_MAX_NODES --- ggml-metal.m | 57 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 3f098d39677a0..b47a98e214b61 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -7,6 +7,11 @@ #import #import +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + #ifdef GGML_METAL_NDEBUG #define metal_printf(...) 
#else @@ -15,6 +20,8 @@ #define UNUSED(x) (void)(x) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + struct ggml_metal_buffer { const char * name; @@ -36,7 +43,7 @@ int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - int concur_list[GGML_MAX_NODES]; + int concur_list[GGML_MAX_CONCUR]; int concur_list_len; // custom kernels @@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[GGML_MAX_NODES]; + int nodes_unused[GGML_MAX_CONCUR]; - for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;} - for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;} + for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } + for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } ctx->concur_list_len = 0; - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos + int n_left = gf->n_nodes; + int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list + int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos while (n_left > 0) { // number of nodes at a layer (that can be issued concurrently) @@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency( for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { if (nodes_unused[i]) { // if the requirements for gf->nodes[i] are satisfied - int exe_flag=1; + int exe_flag = 1; + // scan all srcs for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; if (src_cur) { // if is leaf nodes it's satisfied. - if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;} + // TODO: ggml_is_leaf() + if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { + continue; + } // otherwise this src should be the output from previous nodes. int is_found = 0; + // scan 2*search_depth back because we inserted barrier. - for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;} + //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { + for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { + if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { + is_found = 1; + break; + } + } + if (is_found == 0) { + exe_flag = 0; + break; } - if (is_found == 0) {exe_flag = 0; break;} } } if (exe_flag) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. 
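                        // (annotation, not part of the patch) a sketch of the disjointness test applied
                        // below, treating each node's output as the half-open byte range [start, start + len):
                        //   no conflict  <=>  other_start >= start + len  ||  other_start + other_len <= start
                        // any other ordering means the two ranges overlap, so the node cannot be issued early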
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); + int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); for (int j = n_start; j < i; j++) { if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ && gf->nodes[j]->op != GGML_OP_VIEW \ @@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency( if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { continue; - } else { - exe_flag = 0; } + + exe_flag = 0; } } } @@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency( ctx->concur_list[level_pos + concurrency] = -1; ctx->concur_list_len++; // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) {n_start++;} + while (!nodes_unused[n_start]) { + n_start++; + } level_pos += concurrency + 1; } - if (ctx->concur_list_len > GGML_MAX_NODES) { + if (ctx->concur_list_len > GGML_MAX_CONCUR) { fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -453,7 +474,7 @@ void ggml_metal_graph_compute( // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES; + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; From 3d9a55181603e85a26378a850a14068034e5002d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 7 Aug 2023 10:09:40 +0200 Subject: [PATCH 38/51] Fixed mmap prefetch for GPU offloading (#2529) --- llama-util.h | 2 +- llama.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-util.h b/llama-util.h index 3fc03ce28273e..6e9e39ddb6f58 100644 --- a/llama-util.h +++ b/llama-util.h @@ -219,7 +219,7 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } + if (prefetch >= file->size) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { diff --git a/llama.cpp b/llama.cpp index 839739870eb3e..39aefd499dd0c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -747,12 +747,12 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; - size_t prefetch_size = 0; + size_t prefetch_size = file_loader->file.size; size_t lock_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; - if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { - prefetch_size += lt.size; + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + prefetch_size -= lt.size; } } From 99d29c0094476c4962023036ecd61a3309d0e16b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 13:20:09 +0300 Subject: [PATCH 39/51] ggml : sync (custom ops) (#2537) ggml-ci --- ggml.c | 379 +++++++++++++++++++++++++++++++++++++++++++++------------ ggml.h | 145 +++++++++++++++++----- 2 files changed, 413 insertions(+), 111 deletions(-) diff --git a/ggml.c b/ggml.c index fa0f98aa09df2..b4a36524021f9 100644 --- a/ggml.c +++ b/ggml.c @@ -195,8 +195,8 @@ 
typedef void * thread_ret_t; #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; +inline static void * ggml_aligned_malloc(size_t size) { + void * aligned_memory = NULL; #ifdef GGML_USE_METAL int result = posix_memalign(&aligned_memory, getpagesize(), size); #else @@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -6890,7 +6890,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d( ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), a->ne[2], 1, 1, }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6905,10 +6905,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( // ggml_conv_2d -struct ggml_tensor* ggml_conv_2d( - struct ggml_context* ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -6929,7 +6929,7 @@ struct ggml_tensor* ggml_conv_2d( ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), a->ne[3], b->ne[3], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6945,7 +6945,7 @@ struct ggml_tensor* ggml_conv_2d( // ggml_conv_1d_ph -struct ggml_tensor* ggml_conv_1d_ph( +struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -6963,7 +6963,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { // ggml_pool_1d -struct ggml_tensor* ggml_pool_1d( +struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -6982,7 +6982,7 @@ struct ggml_tensor* ggml_pool_1d( ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -6996,7 +6996,7 @@ struct ggml_tensor* ggml_pool_1d( // ggml_pool_2d -struct ggml_tensor* ggml_pool_2d( +struct ggml_tensor * ggml_pool_2d( struct 
ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -7019,7 +7019,7 @@ struct ggml_tensor* ggml_pool_2d( ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, ¶ms, sizeof(params)); @@ -7349,7 +7349,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom1 +// ggml_map_custom1_f32 static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, @@ -7366,7 +7366,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -7387,7 +7387,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32( return ggml_map_custom1_impl_f32(ctx, a, fun, true); } -// ggml_map_custom2 +// ggml_map_custom2_f32 static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_context * ctx, @@ -7405,7 +7405,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7429,7 +7429,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32( return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom3 +// ggml_map_custom3_f32 static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_context * ctx, @@ -7448,7 +7448,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7475,6 +7475,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); } +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + + + // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9283,8 +9467,8 @@ static void ggml_compute_forward_sum_rows_f32( for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; @@ -12894,7 +13078,7 @@ static void ggml_compute_forward_pool_1d( const struct ggml_tensor * src0, struct ggml_tensor * dst) { - const int32_t* opts = (const int32_t*)dst->op_params; + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -14227,24 +14411,6 @@ static void ggml_compute_forward_map_custom1_f32( fun(dst, a); } - -static void ggml_compute_forward_map_custom1( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - struct ggml_tensor * dst, - const ggml_custom1_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom1_f32(params, a, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom2 static void ggml_compute_forward_map_custom2_f32( @@ -14263,24 +14429,6 @@ static void ggml_compute_forward_map_custom2_f32( } -static void ggml_compute_forward_map_custom2( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - struct ggml_tensor * dst, - const ggml_custom2_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -14299,24 +14447,52 @@ static void ggml_compute_forward_map_custom3_f32( fun(dst, a, b, c); } +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct 
ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, - struct ggml_tensor * dst, - const ggml_custom3_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); } // ggml_compute_forward_cross_entropy_loss @@ -14838,25 +15014,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM2_F32: { ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM3: + case GGML_OP_MAP_CUSTOM3_F32: { ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_OP_MAP_CUSTOM3: + { + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15664,6 +15855,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: case GGML_OP_MAP_CUSTOM1: case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: @@ -16449,11 +16643,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = 
MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_MAP_CUSTOM3: { - n_tasks = 1; + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } } break; case GGML_OP_CROSS_ENTROPY_LOSS: { diff --git a/ggml.h b/ggml.h index aba92480c833c..bdbd128004332 100644 --- a/ggml.h +++ b/ggml.h @@ -183,6 +183,15 @@ # define GGML_API #endif +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + #include #include #include @@ -374,6 +383,10 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + GGML_OP_MAP_CUSTOM1, GGML_OP_MAP_CUSTOM2, GGML_OP_MAP_CUSTOM3, @@ -570,6 +583,8 @@ extern "C" { GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -1240,7 +1255,7 @@ extern "C" { // conv_1d with padding = half // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor* ggml_conv_1d_ph( + GGML_API struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1253,7 +1268,7 @@ extern "C" { GGML_OP_POOL_COUNT, }; - GGML_API struct ggml_tensor* ggml_pool_1d( + GGML_API struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1261,7 +1276,7 @@ extern "C" { int s0, // stride int p0); // padding - GGML_API struct ggml_tensor* ggml_pool_2d( + GGML_API struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1315,15 +1330,6 @@ extern "C" { int h0, int w); - // custom operators - - typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - GGML_API struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1334,63 +1340,138 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); - GGML_API struct ggml_tensor * ggml_map_unary_f32( + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct 
ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_binary_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct 
ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); // loss function From 9082b5dfbfae01243a0b822dcd2812877e63bf1b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 13:55:18 +0300 Subject: [PATCH 40/51] ggml : change params pointer (style change) (#2539) ggml-ci --- ggml.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/ggml.c b/ggml.c index b4a36524021f9..9c4b49db86be9 100644 --- a/ggml.c +++ b/ggml.c @@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, - /*.op_params =*/ {0}, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings assert(params_size <= GGML_MAX_OP_PARAMS); memcpy(tensor->op_params, params, params_size); } @@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute( result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); return result; } @@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[6] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, n_dims, mode, n_ctx }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi( int32_t op_params[3] = { n_past, n_head }; memcpy(op_params + 2, &bias_max, sizeof(float)); - ggml_set_op_params(result, &op_params, sizeof(op_params)); + ggml_set_op_params(result, op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6893,7 +6894,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6932,7 +6933,7 @@ struct ggml_tensor * ggml_conv_2d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6985,7 +6986,7 @@ struct ggml_tensor * ggml_pool_1d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7022,7 +7023,7 @@ struct ggml_tensor * ggml_pool_2d( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; From 60baff7c8584ec369e53469cad5f92e102b1efe4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 14:24:42 +0300 Subject: [PATCH 41/51] ggml : pad result of ggml_nbytes() --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 9c4b49db86be9..c97c2b63f346b 100644 --- a/ggml.c +++ b/ggml.c @@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { // // is enough, but just in case, adding the second part - return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); + return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { From 93356bdb7a324a8f6570f99d02af392cd4c45796 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 14:25:58 +0300 Subject: [PATCH 42/51] ggml : mul mat tweaks (#2372) * ggml : mul mat wip ggml-ci * ggml : alternative thread distribution for mul_mat ggml-ci * ggml : mul_mat block tiling attempt * ggml : mul_mat threads yield ggml-ci --- ggml.c | 134 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 55 deletions(-) diff --git a/ggml.c b/ggml.c index c97c2b63f346b..beb7f464167d5 100644 --- a/ggml.c +++ b/ggml.c @@ -10731,71 +10731,95 @@ static void ggml_compute_forward_mul_mat( return; } - // parallelize by src0 rows - const int64_t dr = (ne01 + nth - 1)/nth; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - const int64_t ir10 = dr*ith; - const int64_t ir11 = MIN(ir10 + dr, ne01); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows - // src1 rows - const int64_t nr1 = ne11*ne12*ne13; + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + // distribute the thread work across the inner or outer loop based on which one is larger - for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { - const int64_t i13 = (ir1/(ne12*ne11)); - const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; - const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - - const int64_t ir0 = (ir1/ne11)%(ne02*ne03); - const int64_t i03 = (ir0/(ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. 
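    // (annotation, not part of the patch) a worked example of the broadcast indexing that
    // replaces this hack, using hypothetical shapes: with ne02 = 8 src0 matrices and
    // ne12 = 32 src1 matrices, the factor r2 = ne12/ne02 = 4 below, so src1 matrix i12
    // reads src0 matrix i02 = i12/r2 -- each src0 matrix is shared by 4 consecutive src1 matrices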
- // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: - // GG: this is likely the correct way to broadcast, though need some more thought - // therefore leaving the comments to remind us for now - const int64_t i02 = (i12 / (ne12 / ne02)); - // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) - // const int64_t i02 = (ir0 - i03*ne02); - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); - - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - - for (int64_t ir = ir10; ir < ir11; ++ir) { - vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); - } + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; } - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; -// ggml_compute_forward_out_prod + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is 
not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } +} + +// ggml_compute_forward_out_prod static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, From f3c3b4b1672d860800639c87d3b5d17564692469 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Mon, 7 Aug 2023 19:07:19 +0200 Subject: [PATCH 43/51] Add --rope-scale parameter (#2544) * common.cpp : Add --rope-scale parameter * README.md : Add info about using linear rope scaling --- examples/common.cpp | 11 +++++++++-- examples/main/README.md | 6 ++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 21f4a0357d422..4d3ba9bb23ab4 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = 1.0f/std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -564,8 +570,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " negative prompt to use for guidance. 
(default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
diff --git a/examples/main/README.md b/examples/main/README.md
index 014112e5db484..55c16096f03b7 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8. This should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.

From 2d7baaf50f3277e65cf71071f61ea34823d14c30 Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Tue, 8 Aug 2023 06:44:48 -0500
Subject: [PATCH 44/51] vim : streaming and more (#2495)

* Update Vim plugin

* Remove getbufoneline usage, Add input bind example.

getbufoneline() appears to be a recently added function and has been replaced with getbufline for compatibility.

An additional example that explains how to add a keybind that works in insert mode was added.
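[Annotation, not part of the patch series] A minimal usage sketch for the `--rope-scale` flag introduced in PATCH 43; the model path and prompt are placeholders, assuming a hypothetical fine-tune whose context was linearly extended 8x from a 4k base:

    ./main -m ./models/your-32k-finetune.bin -c 32768 --rope-scale 8 -p "Your prompt here"

Equivalently, `--rope-freq-scale 0.125` sets the same scaling directly, since the patch stores `--rope-scale N` as `rope_freq_scale = 1/N`.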
--- examples/llama.vim | 132 +++++++++++++++++++++++++++++++++++++++++++++ examples/llm.vim | 23 -------- 2 files changed, 132 insertions(+), 23 deletions(-) create mode 100644 examples/llama.vim delete mode 100644 examples/llm.vim diff --git a/examples/llama.vim b/examples/llama.vim new file mode 100644 index 0000000000000..f03fadfb7a017 --- /dev/null +++ b/examples/llama.vim @@ -0,0 +1,132 @@ +" Requires an already running llama.cpp server +" To install either copy or symlink to ~/.vim/autoload/llama.vim +" Then start with either :call llama#doLlamaGen(), +" or add a keybind to your vimrc such as +" nnoremap Z :call llama#doLlamaGen() +" Similarly, you could add an insert mode keybind with +" inoremap call llama#doLlamaGen() +" +" g:llama_api_url and g:llama_overrides can be configured in your .vimrc +" let g:llama_api_url = "192.168.1.10:8080" +" llama_overrides can also be set through buffer/window scopes. For instance +" autocmd filetype python let b:llama_overrides = {"temp": 0.2} +" Could be added to your .vimrc to automatically set a lower temperature when +" editing a python script +" Additionally, an override dict can be stored at the top of a file +" !*{"stop": ["User:"]} +" Could be added to the start of your chatlog.txt to set the stopping token +" These parameter dicts are merged together from lowest to highest priority: +" server default -> g:llama_overrides -> w:llama_overrides -> +" b:llama_overrides -> in file (!*) overrides +" +" Sublists (like logit_bias and stop) are overridden, not merged +" Example override: +" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} +if !exists("g:llama_api_url") + let g:llama_api_url= "127.0.0.1:8080" +endif +if !exists("g:llama_overrides") + let g:llama_overrides = {} +endif +const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } +const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"] +let s:linedict = {} + +func s:callbackHandler(bufn, channel, msg) + if len(a:msg) < 3 + return + elseif a:msg[0] == "d" + let l:msg = a:msg[6:-1] + else + let l:msg = a:msg + endif + let l:decoded_msg = json_decode(l:msg) + let l:newtext = split(l:decoded_msg['content'], "\n", 1) + if len(l:newtext) > 0 + call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. 
+  else
+    echo "nothing genned"
+  endif
+  if len(newtext) > 1
+    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
+    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
+  endif
+  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
+    echo "Finished generation"
+  endif
+endfunction
+
+func llama#doLlamaGen()
+  if exists("b:job")
+    if job_status(b:job) == "run"
+      call job_stop(b:job)
+      return
+    endif
+  endif
+
+  let l:cbuffer = bufnr("%")
+  let s:linedict[l:cbuffer] = line('$')
+  let l:buflines = getbufline(l:cbuffer, 1, 1000)
+  let l:querydata = copy(s:querydata)
+  call extend(l:querydata, g:llama_overrides)
+  if exists("w:llama_overrides")
+    call extend(l:querydata, w:llama_overrides)
+  endif
+  if exists("b:llama_overrides")
+    call extend(l:querydata, b:llama_overrides)
+  endif
+  if l:buflines[0][0:1] == '!*'
+    let l:userdata = json_decode(l:buflines[0][2:-1])
+    call extend(l:querydata, l:userdata)
+    let l:buflines = l:buflines[1:-1]
+  endif
+  let l:querydata.prompt = join(l:buflines, "\n")
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
+endfunction
+
+" Echoes the tokenization of the provided string, or from the cursor to the end of the word
+" Onus is placed on the user to include the preceding space
+func llama#tokenizeWord(...)
+  if (a:0 > 0)
+    let l:input = a:1
+  else
+    exe "normal \"*ye"
+    let l:input = @*
+  endif
+  let l:querydata = {"content": l:input}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
+endfunction
+
+func s:tokenizeWordCallback(plaintext, channel, msg)
+  echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
+endfunction
+
+
+" Echoes the token count of the entire buffer (or provided string)
+" Example usage: :echo llama#tokenCount()
+func llama#tokenCount(...)
+  if (a:0 > 0)
+    let l:buflines = a:1
+  else
+    let l:buflines = getline(1,1000)
+    if l:buflines[0][0:1] == '!*'
+      let l:buflines = l:buflines[1:-1]
+    endif
+    let l:buflines = join(l:buflines, "\n")
+  endif
+  let l:querydata = {"content": l:buflines}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+endfunction
+
+func s:tokenCountCallback(channel, msg)
+  let resp = json_decode(a:msg)
+  echo len(resp.tokens)
+endfunction
diff --git a/examples/llm.vim b/examples/llm.vim
deleted file mode 100644
index efecad0cd89f1..0000000000000
--- a/examples/llm.vim
+++ /dev/null
@@ -1,23 +0,0 @@
-function! Llm()
-
-    let url = "http://127.0.0.1:8080/completion"
-
-    " Get the content of the current buffer
-    let buffer_content = join(getline(1, '$'), "\n")
-
-    " Create the JSON payload
-    let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
-    let json_payload.prompt = buffer_content
-
-    " Define the curl command
-    let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . 
url - let response = system(curl_command, json_encode(json_payload)) - - " Extract the content field from the response - let content = json_decode(response).content - - " Insert the content at the cursor position - call setline(line('.'), getline('.') . content) -endfunction - -command! Llm call Llm() From e7f94d6fdc83b41ba449b4b8c80821673dd12ffc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Aug 2023 15:05:30 +0300 Subject: [PATCH 45/51] vim : bring back simple llm.vim example --- examples/llm.vim | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 examples/llm.vim diff --git a/examples/llm.vim b/examples/llm.vim new file mode 100644 index 0000000000000..473e0077ad4e9 --- /dev/null +++ b/examples/llm.vim @@ -0,0 +1,25 @@ +" Basic plugin example + +function! Llm() + + let url = "http://127.0.0.1:8080/completion" + + " Get the content of the current buffer + let buffer_content = join(getline(1, '$'), "\n") + + " Create the JSON payload + let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false} + let json_payload.prompt = buffer_content + + " Define the curl command + let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url + let response = system(curl_command, json_encode(json_payload)) + + " Extract the content field from the response + let content = json_decode(response).content + + " Insert the content at the cursor position + call setline(line('.'), getline('.') . content) +endfunction + +command! Llm call Llm() From 7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 Mon Sep 17 00:00:00 2001 From: chaihahaha Date: Tue, 8 Aug 2023 20:07:02 +0800 Subject: [PATCH 46/51] llm.vim : multiline autocompletion, get rid of "^@" (#2543) --- examples/llm.vim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llm.vim b/examples/llm.vim index 473e0077ad4e9..594a285493dcc 100644 --- a/examples/llm.vim +++ b/examples/llm.vim @@ -18,8 +18,10 @@ function! Llm() " Extract the content field from the response let content = json_decode(response).content + let split_newlines = split(content, '\n', 1) + " Insert the content at the cursor position - call setline(line('.'), getline('.') . content) + call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:]) endfunction command! 
Llm call Llm()
From acfc5478ff3446ca3b54553967a3dea09b7c771a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 8 Aug 2023 14:38:16 +0200
Subject: [PATCH 47/51] CUDA: tighter VRAM scratch size for 65b/70b (#2551)

---
 llama.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 39aefd499dd0c..71061aab910ef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
     { MODEL_7B,   512ull * kB },
     { MODEL_13B,  640ull * kB },
     { MODEL_30B,  768ull * kB },
-    { MODEL_65B, 1536ull * kB },
-    { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+    { MODEL_65B, 1280ull * kB },
+    { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
     { MODEL_7B,  128ull },
     { MODEL_13B, 160ull },
     { MODEL_30B, 208ull },
-    { MODEL_65B, 416ull },
-    { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+    { MODEL_65B, 256ull },
+    { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
From f5bfea0580e417f99850d5456ca541d871a3e48c Mon Sep 17 00:00:00 2001
From: Martin Krasser
Date: Tue, 8 Aug 2023 15:29:19 +0200
Subject: [PATCH 48/51] Allow passing grammar to completion endpoint (#2532)

* Allow passing grammar to completion endpoint
---
 Makefile                   |  2 +-
 examples/server/README.md  |  2 ++
 examples/server/server.cpp | 60 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 897c5cb9abcca..32598edfe847d 100644
--- a/Makefile
+++ b/Makefile
@@ -380,7 +380,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
diff --git a/examples/server/README.md b/examples/server/README.md
index aee31ae42e517..e56ca063a9f0e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -151,6 +151,8 @@ node .
     `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
 
+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
     `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
 
     `ignore_eos`: Ignore end of stream token and continue generating (default: false).
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6f7a66da108c8..10ae264f516f4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"
 
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -195,6 +196,8 @@ struct llama_server_context
     llama_context *ctx = nullptr;
     gpt_params params;
 
+    llama_grammar *grammar = nullptr;
+
     bool truncated = false;
     bool stopped_eos = false;
     bool stopped_word = false;
@@ -226,6 +229,7 @@ struct llama_server_context
     void rewind()
     {
         params.antiprompt.clear();
+        params.grammar.clear();
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
@@ -237,6 +241,7 @@ struct llama_server_context
         stopped_limit = false;
         stopping_word = "";
         multibyte_pending = 0;
+        grammar = nullptr;
 
         n_remain = 0;
         n_past = 0;
@@ -257,6 +262,33 @@ struct llama_server_context
         return true;
     }
 
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            grammar_parser::parse_state parsed_grammar;
+
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos());
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
     void loadPrompt()
     {
         params.prompt.insert(0, 1, ' '); // always add a first space
@@ -420,6 +452,10 @@ struct llama_server_context
             logits[llama_token_nl()] = nl_logit;
         }
 
+        if (grammar != nullptr) {
+            llama_sample_grammar(ctx, &candidates_p, grammar);
+        }
+
         if (temp <= 0)
         {
             // Greedy sampling
@@ -457,10 +493,15 @@ struct llama_server_context
             }
         }
 
+        if (grammar != nullptr) {
+            llama_grammar_accept_token(ctx, grammar, result.tok);
+        }
+
         for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
         {
             result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
         }
+
         last_n_tokens.erase(last_n_tokens.begin());
         last_n_tokens.push_back(result.tok);
         num_tokens_predicted++;
@@ -947,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama)
         {"stream", llama.stream},
         {"logit_bias", llama.params.logit_bias},
         {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
     };
 }
@@ -1048,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &llama)
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.prompt = body.value("prompt", default_params.prompt);
+    llama.params.grammar = body.value("grammar", default_params.grammar);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
 
     llama.params.logit_bias.clear();
@@ -1179,6 
+1222,12 @@ int main(int argc, char **argv) parse_options_completion(json::parse(req.body), llama); + if (!llama.loadGrammar()) + { + res.status = 400; + return; + } + llama.loadPrompt(); llama.beginCompletion(); @@ -1334,8 +1383,12 @@ int main(int argc, char **argv) svr.set_error_handler([](const Request &, Response &res) { - res.set_content("File Not Found", "text/plain"); - res.status = 404; }); + if (res.status == 400) { + res.set_content("Invalid request", "text/plain"); + } else { + res.set_content("File Not Found", "text/plain"); + res.status = 404; + } }); // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); @@ -1363,6 +1416,9 @@ int main(int argc, char **argv) return 1; } + if (llama.grammar != nullptr) { + llama_grammar_free(llama.grammar); + } llama_backend_free(); return 0; From 4024f91a665d83b6de8658d45ec9d004c5d90c79 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 9 Aug 2023 01:56:44 +0300 Subject: [PATCH 49/51] Add intrinsics polyfills for AMD --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> --- CMakeLists.txt | 1 - Makefile | 1 - ggml-cuda.cu | 53 ++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0da4db55820df..5d64cf77001ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) diff --git a/Makefile b/Makefile index 450e055fd947f..c3ef75f3fdea0 100644 --- a/Makefile +++ b/Makefile @@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 033df63349704..96e558e4cbbab 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -75,6 +75,29 @@ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#else + const int8x4_t va = reinterpret_cast(a); + 
const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].ds.x = d; - y[ib].ds.y = sum; + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; } template @@ -1609,8 +1632,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = __low2float(dm8) * __low2float(ds8); + const float m8s8 = __high2float(dm8) * __high2float(ds8); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); } static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); @@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); @@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2half(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float dall = bq4_K->d[0]; const float dmin = bq4_K->d[1]; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #pragma unroll for (int i = 0; i < QR5_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2float(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d = bq5_K->d; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float 
vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2*i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; + *dfi_dst = __low2half(*dsi_src); } } From 25d43e0eb578b6e73046d9d6644a3a14d460600d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 9 Aug 2023 09:42:34 +0200 Subject: [PATCH 50/51] CUDA: tuned mul_mat_q kernels (#2546) --- Makefile | 5 - README.md | 1 - ggml-cuda.cu | 1068 ++++++++++++++++++++++++++++++++------------------ 3 files changed, 682 insertions(+), 392 deletions(-) diff --git a/Makefile b/Makefile index 32598edfe847d..f01bf0c8324ed 100644 --- a/Makefile +++ b/Makefile @@ -253,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_MMQ_Y - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -else - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 -endif # LLAMA_CUDA_MMQ_Y #ifdef LLAMA_CUDA_CUBLAS # NVCCFLAGS += -DGGML_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS diff --git a/README.md b/README.md index 2ece294b7c947..6900b1152e736 100644 --- a/README.md +++ b/README.md @@ -406,7 +406,6 @@ Building the program with BLAS support may lead to some performance improvements ---> | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. 
| diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..6390b1158b6a6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -14,6 +14,7 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_TURING 700 #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#ifndef GGML_CUDA_MMQ_Y -#define GGML_CUDA_MMQ_Y 64 -#endif // GGML_CUDA_MMQ_Y - // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = false; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; + static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -1549,8 +1560,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -1884,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -1910,7 +1921,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -1920,39 +1931,30 @@ template static __device__ 
__forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } + if (need_check) { + i = min(i, i_max); + } -// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1960,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } return vec_dot_q4_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1987,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { 
__builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2011,7 +2013,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_1 * bx0 = (block_q4_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2027,7 +2029,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2044,27 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); int u[2*VDR_Q4_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } return vec_dot_q4_1_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -2087,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2111,7 +2105,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bx0 = (block_q5_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if 
(need_check) { @@ -2147,7 +2141,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; if (need_check) { @@ -2164,14 +2158,6 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; @@ -2181,12 +2167,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -2209,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q5_1( +template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2233,7 +2219,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bx0 = (block_q5_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2266,7 +2252,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) { + for (int i0 = 0; i0 < mmq_y; i0 += 
nwarps * QI5_1) { int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2283,14 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2298,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -2323,21 +2301,21 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); } -static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q8_0( +template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2348,7 +2326,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bx0 = (block_q8_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2358,41 +2336,29 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < 
GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; -// #if GGML_CUDA_MMQ_Y < 64 -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } -// #endif // GGML_CUDA_MMQ_Y < 64 + if (need_check) { + i = min(i, i_max); + } -// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -2424,23 +2390,23 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } -static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q2_K( +template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2450,7 +2416,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ const block_q2_K * bx0 = (block_q2_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2466,8 +2432,8 @@ template static __device__ __forceinline__ void load_tiles_q2_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2479,7 +2445,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ } #pragma unroll - for 
(int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2496,14 +2462,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0); - const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; const float * y_df = (const float *) y_ds; @@ -2520,7 +2478,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k; + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } @@ -2551,12 +2509,12 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } -static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2564,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q3_K( +template static __device__ __forceinline__ void load_tiles_q3_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2579,7 +2537,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bx0 = (block_q3_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2596,8 +2554,8 @@ template static __device__ __forceinline__ void load_tiles_q3_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + 
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2609,7 +2567,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); if (need_check) { @@ -2623,7 +2581,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2652,14 +2610,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0); - const int kbx = k / QI3_K; const int ky = (k % QI3_K) * QR3_K; const float * x_dmf = (const float *) x_dm; @@ -2681,7 +2631,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( v[l] = __vsubss4(vll, vlh); } - const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K; + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } @@ -2778,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q4_K( +template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2804,7 +2754,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_K * bx0 = (block_q4_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2820,8 +2770,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { 
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2833,8 +2783,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2858,14 +2808,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0); - int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; #pragma unroll @@ -2876,7 +2818,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k; + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } @@ -2969,23 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q5_K( +template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2995,7 +2937,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_K * bx0 = (block_q5_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3024,8 +2966,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { - int i = (i0 + i_offset * 
QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3037,8 +2979,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3062,18 +3004,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0); - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k; + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } @@ -3103,23 +3037,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } -static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q6_K( +template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -3129,7 +3063,7 @@ template static __device__ __forceinline__ void load_tiles_q6_ const block_q6_K * bx0 = (block_q6_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3159,8 +3093,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { - int i = (i0 + i_offset * 
@@ -3103,23 +3037,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -3129,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     const block_q6_K * bx0 = (block_q6_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -3159,8 +3093,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3172,8 +3106,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3189,25 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0);
-
     const float * x_dmf = (const float *) x_dm;
     const float * y_df  = (const float *) y_ds;

     const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

-    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
-    const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k;
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+    const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
     return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }

-template <int qk, int qr, int qi, bool need_sum, typename block_q_t,
-          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __global__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -3222,14 +3148,11 @@ static __global__ void mul_mat_q(

     const int & ncols_dst = ncols_y;

-    const int tid_x = threadIdx.x;
-    const int tid_y = threadIdx.y;
-
-    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+    const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + tid_x;
+    const int row_dst = row_dst_0 + threadIdx.x;

-    const int col_dst_0 = blockIdx.y*WARP_SIZE;
+    const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;

     int * tile_x_ql = nullptr;
@@ -3239,64 +3162,65 @@ static __global__ void mul_mat_q(

     allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

-    const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
-
-    __shared__ int   tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
-    __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
+    __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

-    float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

+#pragma unroll
         for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + tid_x;
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
             const int kbxd = kqs / QI8_1;

-            for (int i = 0; i < WARP_SIZE; i += 8) {
-                const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                 const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

-                tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
+                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
             }
-        }

-        for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
-            const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
-            const int kby = tid_x % blocks_per_tile_y_col;
-            const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-            // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-            const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
-            half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby];
-            if (need_sum) {
-                *dsi_dst = *dsi_src;
-            } else {
-                float * dfi_dst = (float *) dsi_dst;
-                *dfi_dst = (*dsi_src).x;
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src).x;
+                }
             }
-        }

-        __syncthreads();
+            __syncthreads();

-#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
 #pragma unroll
-#endif // __CUDA_ARCH__ >= 700
-    for (int k = 0; k < WARP_SIZE; k += vdr) {
+                for (int j = 0; j < mmq_x; j += nwarps) {
 #pragma unroll
-        for (int j = 0; j < WARP_SIZE; j += 8) {
-#pragma unroll
-            for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-                sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                                                 tid_x + i, tid_y + j, k);
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
                 }
             }
-    }

-        __syncthreads();
+            __syncthreads();
+        }
     }

@@ -3304,15 +3228,15 @@ static __global__ void mul_mat_q(
         return;
     }

-    for (int j = 0; j < WARP_SIZE; j += 8) {
-        const int col_dst = col_dst_0 + j + tid_y;
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;

         if (col_dst >= ncols_dst) {
             return;
         }

-        for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
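All ten launchers that follow share one dispatch skeleton; only the per-type tile constants and template arguments differ. Each reads the device's compute capability, picks a (mmq_x, mmq_y, nwarps) triple, derives the grid from it, and instantiates the kernel twice per branch so that the need_check == false variant can skip the i = min(i, i_max) clamping whenever nrows_x divides evenly by mmq_y. A standalone sketch of that geometry calculation follows (the capability and matrix sizes are made-up demo inputs, and CC_TURING's numeric value is an assumption; the real constant is defined earlier in ggml-cuda.cu):

    #include <stdio.h>

    int main(void) {
        const int CC_TURING = 700;          /* assumed value of the file's constant */
        const int WARP_SIZE = 32;
        const int compute_capability = 750; /* pretend g_compute_capabilities[id] reported Turing */
        const int nrows_x = 4096, ncols_y = 512;

        const int turing = compute_capability >= CC_TURING;
        const int mmq_x  = 64;                /* q4_0 values from the first launcher below */
        const int mmq_y  = turing ? 128 : 64;
        const int nwarps = 4;

        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; /* one block row per mmq_y output rows */
        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; /* one block col per mmq_x output cols */
        printf("grid %dx%d, block %dx%d, need_check=%d\n",
               block_num_x, block_num_y, WARP_SIZE, nwarps, nrows_x % mmq_y != 0);
        return 0;
    }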
@@ -4014,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4032,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
+    }
 }
@@ -4050,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, true, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4068,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4086,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4104,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4122,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 128;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4140,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 32;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4158,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
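The Turing-and-newer configurations sit close to the static shared-memory budget, which is part of why the fallback branches use smaller tiles (register pressure is the other part; note that q4_K above and q6_K below also shrink mmq_x to 32 pre-Turing). A rough footprint calculation for the q5_K tiles, assuming the usual ggml constants WARP_SIZE == 32, QI5_K == 32 and QI8_1 == 8, and a 4-byte half2 (illustrative, not part of the patch):

    #include <stdio.h>

    int main(void) {
        const int WARP_SIZE = 32, QI5_K = 32, QI8_1 = 8;
        const int mmq_x = 64, mmq_y = 128; /* the q5_K Turing tuning above */
        /* x tiles from allocate_tiles_q5_K, y tiles from mul_mat_q itself: */
        const long x_ql = 4L * (mmq_y * (2*WARP_SIZE)     + mmq_y);
        const long x_dm = 4L * (mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K);
        const long x_sc = 4L * (mmq_y * (WARP_SIZE/8)     + mmq_y/8);
        const long y_qs = 4L * (mmq_x * WARP_SIZE);
        const long y_ds = 4L * (mmq_x * WARP_SIZE/QI8_1);
        printf("%ld bytes of the 49152-byte static shared-memory limit\n",
               x_ql + x_dm + x_sc + y_qs + y_ds); /* 45136 */
        return 0;
    }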
@@ -4176,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x  = 64;
+        const int mmq_y  = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-        mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const int mmq_x  = 32;
+        const int mmq_y  = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }
@@ -4361,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     }
 }

-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 void ggml_init_cublas() {
     static bool initialized = false;

@@ -4730,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
     (void) i1;
 }

+static int64_t get_row_rounding(ggml_type type) {
+    int max_compute_capability = INT_MIN;
+    for (int id = 0; id < g_device_count; ++id) {
+        if (max_compute_capability < g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            max_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
     float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
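get_row_rounding above replaces the old fixed GGML_CUDA_MMQ_Y alignment used by the two hunks that follow: since mmq_y now varies per type and per architecture, a multi-GPU row split has to be aligned to the largest tile height any participating device might use. A worked example with hypothetical values, two devices with a 40/60 tensor split over 4096 rows of a Q4_K tensor on Turing hardware (rounding 128, per the switch above):

    #include <stdio.h>

    int main(void) {
        const long nrows = 4096, rounding = 128;
        const float split0 = 0.40f;             /* device 0's share of the rows */
        long row_high = (long)(nrows * split0); /* 1638 ... */
        row_high -= row_high % rounding;        /* ... aligned down to 1536 */
        printf("device 0: rows [0, %ld)\n", row_high);
        printf("device 1: rows [%ld, %ld)\n", row_high, nrows); /* the last device takes the remainder */
        return 0;
    }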
@@ -5130,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     int64_t row_low, row_high;
     if (split) {
+        const int64_t rounding = get_row_rounding(src0->type);
+
         row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;

         if (id == g_device_count - 1) {
             row_high = nrows0;
         } else {
             row_high = nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         row_low = 0;
@@ -5616,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_low = 0;
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+        const int64_t rounding = get_row_rounding(tensor->type);
+
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;

         if (id == g_device_count - 1) {
             row_high = nrows;
         } else {
             row_high = nrows*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         GGML_ASSERT(false);

From 8f8ab6c4c049df501e9a5ed8fef3aa0fc0691421 Mon Sep 17 00:00:00 2001
From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Date: Wed, 9 Aug 2023 18:05:03 -0500
Subject: [PATCH 51/51] hipLDFLAG Path change Unix to multisystem in Makefile

Changed the hardcoded Linux hipBLAS LD path from -L/opt/rocm/lib to the
user-definable ROCM_PATH variable, so the link flags stay correct for ROCm
installs outside /opt/rocm and on non-Linux OSes.
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 469d62de73bc3..0843d86c6b8d6 100644
--- a/Makefile
+++ b/Makefile
@@ -290,7 +290,7 @@ ifdef LLAMA_HIPBLAS
     LLAMA_CUDA_KQUANTS_ITER ?= 2
     CFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
     CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
-    LDFLAGS  += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
+    LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
     OBJS     += ggml-cuda.o
 ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
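With the link path now derived from ROCM_PATH as well, a non-default ROCm install can be targeted entirely from the make invocation, for example (illustrative values; LLAMA_HIPBLAS, ROCM_PATH and GPU_TARGETS are the Makefile variables visible in the hunks above, and the offload target depends on the GPU):

    make LLAMA_HIPBLAS=1 ROCM_PATH=/opt/rocm-5.6.0 GPU_TARGETS=gfx1030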