From e11048a3aea5067ffa7d3167576ff2c8103ab71f Mon Sep 17 00:00:00 2001
From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Date: Fri, 15 Sep 2023 19:39:05 -0500
Subject: [PATCH] Separate CuBLAS/hipBLAS

---
 .gitignore     |  4 ++++
 CMakeLists.txt | 38 ++++++++++++++++++++++++++++----------
 Makefile       | 23 +++++++++++------------
 koboldcpp.py   | 35 ++++++++++++++++++++++++-----------
 4 files changed, 67 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2fb1af29f9168..1a87853fd635f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,7 @@ tests/test-tokenizer-0
 /koboldcpp_cublas.dll
 /cublas64_11.dll
 /cublasLt64_11.dll
+/rocblas/
+rocblas.dll
+hipblas.dll
+koboldcpp_hipblas.so
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58474caa97f5f..3afc1461f82d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,7 +124,12 @@ if (LLAMA_CUBLAS)
 endif()
 
 if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    if (MSVC)
+        list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD/ROCm/5.5")
+    else()
+        list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    endif()
+
 
     if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
@@ -387,16 +392,29 @@ target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+if (LLAMA_CUBLAS)
+    set(TARGET koboldcpp_cublas)
+    add_library(${TARGET} SHARED expose.cpp expose.h)
+    target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
+    target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
+    set_target_properties(${TARGET} PROPERTIES PREFIX "")
+    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
+    target_compile_features(${TARGET} PRIVATE cxx_std_11)
+endif()
 
-set(TARGET koboldcpp_cublas)
-add_library(${TARGET} SHARED expose.cpp expose.h)
-target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
-target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
-set_target_properties(${TARGET} PROPERTIES PREFIX "")
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
-set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (LLAMA_HIPBLAS)
+    set(TARGET koboldcpp_hipblas)
+    add_library(${TARGET} SHARED expose.cpp expose.h)
+    target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
+    target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
+    set_target_properties(${TARGET} PROPERTIES PREFIX "")
+    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
+    target_compile_features(${TARGET} PRIVATE cxx_std_11)
+endif()
 
 
 if (MAKE_MISC_FILES)
diff --git a/Makefile b/Makefile
index 4eb27e369cb1d..cadbeedb11786 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas
+default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas koboldcpp_hipblas
 tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
 dev: koboldcpp_openblas
 dev2: koboldcpp_clblast
@@ -39,8 +39,8 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS   = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11   -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS
-CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS
+CFLAGS   = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11   -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
 LDFLAGS  =
 
 # these are used on windows, to build some libraries with extra old device compatibility
@@ -211,18 +211,15 @@ endif # LLAMA_CUDA_FORCE_DMMV
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
 	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
 	-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
-	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \
-	-DCC_TURING=1000000000
+	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
 	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
 	-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
-	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \
-	-DCC_TURING=1000000000
+	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
 	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
 	-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
-	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \
-	-DCC_TURING=1000000000 # DGGML_CUDA_DMMV_F16 does not currently work with AMD.
+	-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
@@ -417,7 +414,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
 main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -439,8 +436,10 @@ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o com
 	$(NOAVX2_BUILD)
 koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS)
 	$(CLBLAST_BUILD)
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS)
-	$(CUBLAS_BUILD) $(HIPBLAS_BUILD)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS)
+	$(CUBLAS_BUILD)
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(HIP_OBJS) $(OBJS)
+	$(HIPBLAS_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/koboldcpp.py b/koboldcpp.py
index beb3fe16dd2e1..70b6e80708f09 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -103,6 +103,7 @@ def pick_existant_file(ntoption,nonntoption):
 lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
 lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
 lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
+lib_hipblas = pick_existant_file("koboldcpp_hipblas.dll","koboldcpp_hipblas.so")
 
 
 def init_library():
@@ -113,6 +114,7 @@ def init_library():
     use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
     use_clblast = False #uses CLBlast instead
     use_cublas = False #uses cublas instead
+    use_hipblas = False #uses hipblas instead
     use_noavx2 = False #uses no avx2 instructions
     use_failsafe = False #uses no intrinsics, failsafe mode
     if args.noavx2:
@@ -131,11 +133,16 @@ def init_library():
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
             use_clblast = True
     elif (args.usecublas is not None):
-        if not file_exists(lib_cublas):
+        if not file_exists(lib_cublas) and not file_exists(lib_hipblas):
             print("Warning: CuBLAS library file not found. Non-BLAS library will be used.")
         else:
-            print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.")
-            use_cublas = True
+            if file_exists(lib_cublas):
+                print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.")
+                use_cublas = True
+            elif file_exists(lib_hipblas):
+                print("Attempting to use hipBLAS library for faster prompt ingestion. A compatible AMD GPU will be required.")
+                use_hipblas = True
+
     else:
         if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
             print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
@@ -157,6 +164,8 @@ def init_library():
         libname = lib_clblast
     elif use_cublas:
         libname = lib_cublas
+    elif use_hipblas:
+        libname = lib_hipblas
     elif use_openblas:
         libname = lib_openblas
     else:
@@ -766,10 +775,11 @@ def show_new_gui():
         (lib_openblas, "Use OpenBLAS"),
         (lib_clblast, "Use CLBlast"),
         (lib_cublas, "Use CuBLAS"),
+        (lib_hipblas, "Use hipBLAS (ROCm)"),
         (lib_default, "Use No BLAS"),
         (lib_noavx2, "NoAVX2 Mode (Old CPU)"),
         (lib_failsafe, "Failsafe Mode (Old CPU)")]
-    openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
+    openblas_option, clblast_option, cublas_option, hipblas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
     # slider data
     blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"]
     blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"]
@@ -922,7 +932,7 @@ def setup_backend_tooltip(parent):
 
     def changerunmode(a,b,c):
         index = runopts_var.get()
-        if index == "Use CLBlast" or index == "Use CuBLAS":
+        if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
             quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
             if index == "Use CLBlast":
@@ -930,7 +940,7 @@ def changerunmode(a,b,c):
                 quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
                 if gpu_choice_var.get()=="All":
                     gpu_choice_var.set("1")
-            elif index == "Use CuBLAS":
+            elif index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
                 CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
                 CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             else:
@@ -941,7 +951,7 @@ def changerunmode(a,b,c):
                 quick_gpu_selector_box.grid_forget()
                 CUDA_quick_gpu_selector_box.grid_forget()
 
-        if index == "Use CuBLAS":
+        if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
             quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
             mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
@@ -952,7 +962,7 @@ def changerunmode(a,b,c):
             mmq_box.grid_forget()
             quick_mmq_box.grid_forget()
 
-        if index == "Use CLBlast" or index == "Use CuBLAS":
+        if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
             gpu_layers_entry.grid(row=5, column=1, padx=8, pady=1, stick="nw")
             quick_gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -1147,7 +1157,7 @@ def export_vars():
             gpuchoiceidx = int(gpu_choice_var.get())-1
         if runopts_var.get() == "Use CLBlast":
            args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx]
-        if runopts_var.get() == "Use CuBLAS":
+        if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)":
            if gpu_choice_var.get()=="All":
                 args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
            else:
@@ -1204,8 +1214,11 @@ def import_vars(dict):
                 runopts_var.set(clblast_option)
             gpu_choice_var.set(str(["0 0", "1 0", "0 1", "1 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1))
     elif "usecublas" in dict and dict["usecublas"]:
-        if cublas_option is not None:
-            runopts_var.set(cublas_option)
+        if cublas_option is not None or hipblas_option is not None:
+            if cublas_option:
+                runopts_var.set(cublas_option)
+            elif hipblas_option:
+                runopts_var.set(hipblas_option)
             lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0)
             mmq_var.set(1 if "mmq" in dict["usecublas"] else 0)
             gpu_choice_var.set("All")
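
Note (not part of the patch): the koboldcpp.py hunks above keep the single --usecublas flag for both GPU backends and choose the library at load time, preferring the CuBLAS build when koboldcpp_cublas is present and falling back to the new hipBLAS build otherwise. Below is a minimal illustrative sketch of that pick order only; it uses os.path.exists in place of koboldcpp's file_exists helper, and the function name pick_blas_library is hypothetical.

# Illustrative sketch only -- mirrors the fallback order init_library() uses
# after this change; not code from the patch itself.
import os

def pick_blas_library(usecublas_requested):
    """Return the shared library that would be loaded for --usecublas."""
    lib_cublas = "koboldcpp_cublas.dll" if os.name == "nt" else "koboldcpp_cublas.so"
    lib_hipblas = "koboldcpp_hipblas.dll" if os.name == "nt" else "koboldcpp_hipblas.so"
    if usecublas_requested:
        if os.path.exists(lib_cublas):
            return lib_cublas      # CuBLAS build present: it is preferred
        if os.path.exists(lib_hipblas):
            return lib_hipblas     # otherwise fall back to the ROCm/hipBLAS build
    return None                    # caller then falls back to OpenBLAS/default

if __name__ == "__main__":
    print(pick_blas_library(usecublas_requested=True))

Because export_vars() and import_vars() reuse args.usecublas for the hipBLAS path as well, command lines and saved configs written for the CUDA build should keep working unchanged with a ROCm build.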