diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 6286589293..9c1d25b2c1 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -7,6 +7,8 @@ from types import ModuleType import hashlib import re +import tempfile +import signal import os import shutil import subprocess @@ -51,6 +53,7 @@ class XPUOptions: max_num_imprecise_acc_default: int = 0 # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda extern_libs: dict = None debug: bool = False + generate_native_code: bool = False backend_name: str = 'intel' def __post_init__(self): @@ -62,6 +65,7 @@ def __post_init__(self): object.__setattr__(self, 'extern_libs', tuple(extern_libs.items())) if self.num_warps <= 0 or (self.num_warps & (self.num_warps - 1)) != 0: raise AssertionError("num_warps must be a power of 2") + self.generate_native_code = bool(os.getenv("TRITON_XPU_GEN_NATIVE_CODE", self.generate_native_code)) def hash(self): key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()]) @@ -284,7 +288,7 @@ def make_llir(src, metadata, options): @staticmethod def make_spv(src, metadata, options): - ret, name = intel.translate_to_spirv(src) + spirv, name = intel.translate_to_spirv(src) metadata["name"] = name if options.grf_mode == 'small': metadata["build_flags"] = "-cl-intel-128-GRF-per-thread" @@ -297,7 +301,60 @@ def make_spv(src, metadata, options): else: metadata["build_flags"] = "" - return ret + if options.generate_native_code: + with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.spv') as fsrc, \ + tempfile.NamedTemporaryFile(delete=False, mode='r', suffix='.log') as flog: + fsrc.write(spirv) + fsrc.flush() + fbin = fsrc.name + '.o' + + ocloc_cmd = [ + 'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', 'pvc', '-options', + metadata["build_flags"] + ] + + try: + subprocess.run(ocloc_cmd, check=True, close_fds=False, stdout=flog, stderr=subprocess.STDOUT) + if os.path.exists(flog.name): + with open(flog.name) as log_file: + log = log_file.read().strip() + if 'spilled' in log: + """ + The exact message is something like: + warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217 + is "spilled" enough for now? + """ + metadata["build_flags"] += " -cl-intel-256-GRF-per-thread" + # re-run with new build flags + ocloc_cmd[-1] = metadata["build_flags"] + subprocess.run(ocloc_cmd, check=True, close_fds=False, stdout=flog, + stderr=subprocess.STDOUT) + os.remove(flog.name) + if os.path.exists(fsrc.name): + os.remove(fsrc.name) + except subprocess.CalledProcessError as e: + with open(flog.name) as log_file: + log = log_file.read() + if os.path.exists(flog.name): + os.remove(flog.name) + + if e.returncode == 255: + error = 'Internal Triton ZEBIN codegen error' + elif e.returncode == 128 + signal.SIGSEGV: + error = '`ocloc` raised SIGSEGV' + else: + error = f'`ocloc` failed with error code {e.returncode}' + + raise RuntimeError(f'{error}\n' + f'`ocloc` stderr:\n{log}\n' + f'Repro command: {ocloc_cmd}\n') + + with open(fbin, 'rb') as f: + zebin = f.read() + if os.path.exists(fbin): + os.remove(fbin) + return zebin + return spirv def add_stages(self, stages, options): stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c index d0c1a9654c..77dfb81d08 100644 --- a/third_party/intel/backend/driver.c +++ b/third_party/intel/backend/driver.c @@ -134,8 +134,7 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { const sycl::device sycl_device = sycl_l0_device_pair.first; std::string kernel_name = name; - size_t binary_size = PyBytes_Size(py_bytes); - binary_size = binary_size / sizeof(uint32_t); + const size_t binary_size = PyBytes_Size(py_bytes); uint8_t *binary_ptr = (uint8_t *)PyBytes_AsString(py_bytes); const auto ctx = sycl_device.get_platform().ext_oneapi_get_default_context(); @@ -143,8 +142,13 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { sycl::get_native(sycl_device); const auto l0_context = sycl::get_native(ctx); + + const auto use_native_code = + isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE")); + const bool is_spv = use_native_code ? !(*use_native_code) : true; + auto l0_module = checkSyclErrors(create_module( - l0_context, l0_device, binary_ptr, binary_size, build_flags)); + l0_context, l0_device, binary_ptr, binary_size, build_flags, is_spv)); auto checkL0Errors = [&](auto l0_module) -> ze_kernel_handle_t { if (PyErr_Occurred()) { @@ -169,35 +173,39 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { int32_t n_spills = props.spillMemSize; const int32_t n_regs = 0; - constexpr int32_t max_reg_spill = 1000; - std::string build_flags_str(build_flags); - bool is_GRF_mode_specified = false; - - // Check whether the GRF mode is specified by the build flags. - if (build_flags_str.find("-cl-intel-256-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-128-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") != - std::string::npos) { - is_GRF_mode_specified = true; - } - // If the register mode isn't set, and the number of spills is greater - // than the threshold, recompile the kernel using large GRF mode. - if (!is_GRF_mode_specified && n_spills > max_reg_spill) { - std::cout << "(I): Detected " << n_spills - << " spills, recompiling the kernel using large GRF mode" - << std::endl; - const std::string new_build_flags = - build_flags_str.append(" -cl-intel-256-GRF-per-thread"); - l0_module = - checkSyclErrors(create_module(l0_context, l0_device, binary_ptr, - binary_size, new_build_flags.c_str())); - l0_kernel = checkL0Errors(l0_module); - gpuAssert(zeKernelGetProperties(l0_kernel, &props)); - n_spills = props.spillMemSize; - std::cout << "(I): Kernel has now " << n_spills << " spills" << std::endl; + if (is_spv) { + constexpr int32_t max_reg_spill = 1000; + std::string build_flags_str(build_flags); + bool is_GRF_mode_specified = false; + + // Check whether the GRF mode is specified by the build flags. + if (build_flags_str.find("-cl-intel-256-GRF-per-thread") != + std::string::npos || + build_flags_str.find("-cl-intel-128-GRF-per-thread") != + std::string::npos || + build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") != + std::string::npos) { + is_GRF_mode_specified = true; + } + + // If the register mode isn't set, and the number of spills is greater + // than the threshold, recompile the kernel using large GRF mode. + if (!is_GRF_mode_specified && n_spills > max_reg_spill) { + std::cout << "(I): Detected " << n_spills + << " spills, recompiling the kernel using large GRF mode" + << std::endl; + const std::string new_build_flags = + build_flags_str.append(" -cl-intel-256-GRF-per-thread"); + l0_module = checkSyclErrors( + create_module(l0_context, l0_device, binary_ptr, binary_size, + new_build_flags.c_str(), is_spv)); + + l0_kernel = checkL0Errors(l0_module); + gpuAssert(zeKernelGetProperties(l0_kernel, &props)); + n_spills = props.spillMemSize; + std::cout << "(I): Kernel has now " << n_spills << " spills" << std::endl; + } } auto mod = new sycl::kernel_bundle( diff --git a/third_party/intel/backend/include/sycl_functions.h b/third_party/intel/backend/include/sycl_functions.h index f50cdb727c..592cac947c 100644 --- a/third_party/intel/backend/include/sycl_functions.h +++ b/third_party/intel/backend/include/sycl_functions.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,20 @@ typedef struct l0_resc_handles { using SyclQueueMap = std::unordered_map; +// Create an exception handler for asynchronous SYCL exceptions +auto exception_handler = [](sycl::exception_list e_list) { + for (std::exception_ptr const &e : e_list) { + try { + std::rethrow_exception(e); + } catch (std::exception const &e) { +#if _DEBUG + std::cout << "Failure" << std::endl; +#endif + std::terminate(); + } + } +}; + inline std::string parseZeResultCode(const ze_result_t code) { const std::string prefix = "Triton Error [ZE]: "; std::stringstream ss; @@ -37,6 +52,15 @@ inline std::string parseZeResultCode(const ze_result_t code) { } \ } +// TODO: share Triton GetEnv.hpp impl +inline std::string getStrEnv(const std::string &env) { + const char *cstr = std::getenv(env.c_str()); + if (!cstr) + return ""; + std::string result(cstr); + return result; +} + bool getBoolEnv(const std::string &env) { const char *s = std::getenv(env.c_str()); std::string str(s ? s : ""); @@ -45,19 +69,29 @@ bool getBoolEnv(const std::string &env) { return (str == "on" || str == "true" || str == "1"); } +inline std::optional isEnvValueBool(std::string str) { + std::transform(str.begin(), str.end(), str.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (str == "on" || str == "true" || str == "1") + return true; + if (str == "off" || str == "false" || str == "0") + return false; + return std::nullopt; +} + std::tuple create_module(ze_context_handle_t context, ze_device_handle_t device, - uint8_t *binary_ptr, size_t binary_size, - const char *build_flags) { + uint8_t *binary_ptr, size_t binary_size, const char *build_flags, + const bool is_spv = true) { assert(binary_ptr != nullptr && "binary_ptr should not be NULL"); assert(build_flags != nullptr && "build_flags should not be NULL"); - const ze_module_format_t format = ZE_MODULE_FORMAT_IL_SPIRV; + const ze_module_format_t format = + is_spv ? ZE_MODULE_FORMAT_IL_SPIRV : ZE_MODULE_FORMAT_NATIVE; ze_module_desc_t module_description = {}; module_description.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; module_description.format = format; - module_description.inputSize = - static_cast(binary_size * sizeof(uint32_t)); + module_description.inputSize = static_cast(binary_size); module_description.pInputModule = binary_ptr; module_description.pBuildFlags = build_flags; ze_module_build_log_handle_t buildlog; diff --git a/utils/SPIRVRunner/SPIRVRunner.cpp b/utils/SPIRVRunner/SPIRVRunner.cpp index 1798598406..3d257468da 100644 --- a/utils/SPIRVRunner/SPIRVRunner.cpp +++ b/utils/SPIRVRunner/SPIRVRunner.cpp @@ -10,20 +10,6 @@ #include "sycl_functions.h" -// Create an exception handler for asynchronous SYCL exceptions -static auto exception_handler = [](sycl::exception_list e_list) { - for (std::exception_ptr const &e : e_list) { - try { - std::rethrow_exception(e); - } catch (std::exception const &e) { -#if _DEBUG - std::cout << "Failure" << std::endl; -#endif - std::terminate(); - } - } -}; - auto load_tensor(const std::string &filename) { std::ifstream ins(filename, std::ios::binary); if (!ins.is_open()) { @@ -297,9 +283,8 @@ int main() { auto spirv = read_spirv("add_kernel.spv"); std::cout << "Read " << spirv.size() << " byte kernel." << std::endl; - auto [kernel_bundle, kernel, n_regs, n_spills] = - loadBinary("add_kernel", reinterpret_cast(spirv.data()), - spirv.size() / sizeof(uint32_t), 0); + auto [kernel_bundle, kernel, n_regs, n_spills] = loadBinary( + "add_kernel", reinterpret_cast(spirv.data()), spirv.size(), 0); // TODO: missing number of registers std::cout << "Loaded kernel with " << n_regs << " registers and " << n_spills