From f4fdd8fe2227453cf26ce22a016cac9448448549 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko
Date: Wed, 16 Oct 2024 16:37:53 -0400
Subject: [PATCH] Benchmark improvements (#2415)

I've picked up some useful changes from
https://github.com/intel/intel-xpu-backend-for-triton/pull/1905 and pushed
them here, and also reorganized the Python library build. Essentially this is
a cleanup with a few features added, getting the benchmarks ready for the
2025 release.

List of changes:
- Add benchmark project artifacts to .gitignore
- Compile the Python library using the dedicated CMake API:
  https://cmake.org/cmake/help/latest/module/FindPython3.html
- Suppress old-style warnings from the XeTLA library
- Do not run the CMake build on clean-up commands (previously it ran
  unconditionally)
- If IPEX is not installed, build the library in no-IPEX mode and emit a user
  warning
- More modular setup.py (example usage is sketched after the diff)
- Verbose output of the CMake commands being run
- CMakeLists.txt cleanup
- Fix shadowed import usage of the cmake library

Closes #1905
---
 .gitignore                              |   8 ++
 benchmarks/CMakeLists.txt               |   4 +-
 benchmarks/cmake/FindXeTLALibrary.cmake |   4 +-
 benchmarks/setup.py                     | 130 ++++++++++++------
 benchmarks/xetla_kernel/CMakeLists.txt  |  10 +-
 .../flash_attention/fmha_forward_v5.h   |   2 +
 6 files changed, 110 insertions(+), 48 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1d4f942657..aede44e6cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,19 @@ build-*/
 python/build/
 python/dist/
 python/triton*.egg-info/
+python/*.whl
 python/triton/_C/*.pyd
 python/triton/_C/*.so
 python/triton/_C/*.dylib
 
+benchmarks/dist
+benchmarks/*.egg-info/
+benchmarks/**/*.so
+
+# Logs
+inductor_log/
+
 # Backends copied from submodules
 python/triton/backends/
 !python/triton/backends/__init__.py
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 7d1c59ea6f..98b2543fc3 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -10,9 +10,11 @@ if(NOT WIN32)
   list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 endif()
 
-find_package(Python3 COMPONENTS Interpreter)
+find_package(Python3 REQUIRED
+  COMPONENTS Development.Module)
 find_package(Torch REQUIRED)
 find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
+find_package(XeTLALibrary REQUIRED)
 
 if(USE_IPEX)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_IPEX")
diff --git a/benchmarks/cmake/FindXeTLALibrary.cmake b/benchmarks/cmake/FindXeTLALibrary.cmake
index 9d7868dae3..5599a6f882 100644
--- a/benchmarks/cmake/FindXeTLALibrary.cmake
+++ b/benchmarks/cmake/FindXeTLALibrary.cmake
@@ -3,13 +3,15 @@ include(FetchContent)
 
 if (NOT XeTLALibrary_FOUND)
+  # TODO: switch to FetchContent_MakeAvailable once XeTLA supports it
+  cmake_policy(SET CMP0169 OLD)
   set(XeTLALibrary_SOURCE_DIR
     "${CMAKE_CURRENT_BINARY_DIR}/XeTLALibrary")
   message(STATUS "XeTLALibrary is not specified.
Will try to download XeTLA library from https://github.com/intel/xetla into ${XeTLALibrary_SOURCE_DIR}")
-  file(READ xetla-library.conf XeTLALibrary_TAG)
+  file(READ xetla_kernel/xetla-library.conf XeTLALibrary_TAG)
   # Strip the potential trailing newline from tag
   string(STRIP "${XeTLALibrary_TAG}" XeTLALibrary_TAG)
   FetchContent_Declare(xetla-library
diff --git a/benchmarks/setup.py b/benchmarks/setup.py
index 1b3f0c55cb..7497f02585 100644
--- a/benchmarks/setup.py
+++ b/benchmarks/setup.py
@@ -1,83 +1,135 @@
 import os
-import re
 import shutil
 import subprocess
-import sysconfig
 import sys
 
-from setuptools import setup
+# TODO: update once there is replacement for clean:
+# https://github.com/pypa/setuptools/discussions/2838
+from distutils import log  # pylint: disable=[deprecated-module]
+from distutils.dir_util import remove_tree  # pylint: disable=[deprecated-module]
+from distutils.command.clean import clean as _clean  # pylint: disable=[deprecated-module]
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext as _build_ext
 
 import torch
 
-ipex_cmake_prefix_path = ""
-USE_IPEX_OPTION = os.getenv("USE_IPEX", "1")
-if USE_IPEX_OPTION == "1":
-    import intel_extension_for_pytorch
-    ipex_cmake_prefix_path = f";{intel_extension_for_pytorch.cmake_prefix_path}"
+
+class CMakeExtension(Extension):
+
+    def __init__(self, name):
+        # don't invoke the original build_ext for this special extension
+        super().__init__(name, sources=[])
 
 
 class CMakeBuild():
 
-    def __init__(self):
+    def __init__(self, debug=False, dry_run=False):
         self.current_dir = os.path.abspath(os.path.dirname(__file__))
         self.build_temp = self.current_dir + "/build/temp"
         self.extdir = self.current_dir + "/triton_kernels_benchmark"
+        self.build_type = self.get_build_type(debug)
+        self.cmake_prefix_paths = [torch.utils.cmake_prefix_path]
+        self.use_ipex = False
+        self.dry_run = dry_run
+
+    def get_build_type(self, debug):
+        DEBUG_OPTION = os.getenv("DEBUG", "0")
+        return "Debug" if debug or (DEBUG_OPTION == "1") else "Release"
 
     def run(self):
-        try:
-            out = subprocess.check_output(["cmake", "--version"])
-        except OSError as error:
-            raise RuntimeError("CMake must be installed") from error
+        self.check_ipex()
+        self.build_extension()
 
-        match = re.search(r"version\s*(?P<major>\d+)\.(?P<minor>\d+)([\d.]+)?", out.decode())
-        cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor"))
-        if (cmake_major, cmake_minor) < (3, 18):
-            raise RuntimeError("CMake >= 3.18.0 is required")
+    def check_ipex(self):
+        self.use_ipex = os.getenv("USE_IPEX", "1") == "1"
+        if not self.use_ipex:
+            return
+        try:
+            import intel_extension_for_pytorch
+        except ImportError:
+            log.warn("ipex is not installed trying to build without ipex")
+            self.use_ipex = False
+            return
+        self.cmake_prefix_paths.append(intel_extension_for_pytorch.cmake_prefix_path)
 
-        self.build_extension()
+    def check_call(self, *popenargs, **kwargs):
+        log.info(" ".join(popenargs[0]))
+        if not self.dry_run:
+            subprocess.check_call(*popenargs, **kwargs)
 
     def build_extension(self):
         ninja_dir = shutil.which("ninja")
         # create build directories
         if not os.path.exists(self.build_temp):
            os.makedirs(self.build_temp)
-        # python directories
-        python_include_dir = sysconfig.get_path("platinclude")
         cmake_args = [
             "-G",
             "Ninja",  # Ninja is much faster than make
             "-DCMAKE_MAKE_PROGRAM=" + ninja_dir,  # Pass explicit path to ninja otherwise cmake may cache a temporary path
-            f"-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}{ipex_cmake_prefix_path}",
f"-DUSE_IPEX={USE_IPEX_OPTION}", - "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", - "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=" + self.extdir, - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + self.extdir, - "-DPython3_EXECUTABLE:FILEPATH=" + sys.executable, - "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", - "-DPYTHON_INCLUDE_DIRS=" + python_include_dir, + "-DCMAKE_PREFIX_PATH=" + ";".join(self.cmake_prefix_paths), + f"-DUSE_IPEX={int(self.use_ipex)}", + "-DCMAKE_INSTALL_PREFIX=" + self.extdir, + "-DPython3_ROOT_DIR:FILEPATH=" + sys.exec_prefix, + "-DCMAKE_VERBOSE_MAKEFILE=TRUE", "-DCMAKE_C_COMPILER=icx", "-DCMAKE_CXX_COMPILER=icpx", + "-DCMAKE_BUILD_TYPE=" + self.build_type, + "-S", + self.current_dir, + "-B", + self.build_temp, ] - # configuration - build_type = "Debug" - build_args = ["--config", build_type] - cmake_args += ["-DCMAKE_BUILD_TYPE=" + build_type] max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count())) - build_args += ["-j" + max_jobs] + build_args = [ + "--build", + self.build_temp, + "-j" + max_jobs, + ] + + install_args = [ + "--build", + self.build_temp, + "--target", + "install", + ] env = os.environ.copy() - cmake_dir = self.build_temp - subprocess.check_call(["cmake", self.current_dir] + cmake_args, cwd=cmake_dir, env=env) - subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir) + self.check_call(["cmake"] + cmake_args, env=env) + self.check_call(["cmake"] + build_args) + self.check_call(["cmake"] + install_args) + + def clean(self): + if os.path.exists(self.build_temp): + remove_tree(self.build_temp, dry_run=self.dry_run) + else: + log.warn("'%s' does not exist -- can't clean it", os.path.relpath(self.build_temp, + os.path.dirname(__file__))) + +class build_ext(_build_ext): + + def run(self): + cmake = CMakeBuild(debug=self.debug, dry_run=self.dry_run) + cmake.run() + super().run() + + +class clean(_clean): + + def run(self): + cmake = CMakeBuild(dry_run=self.dry_run) + cmake.clean() + super().run() -cmake = CMakeBuild() -cmake.run() setup(name="triton-kernels-benchmark", packages=[ "triton_kernels_benchmark", ], package_dir={ "triton_kernels_benchmark": "triton_kernels_benchmark", -}, package_data={"triton_kernels_benchmark": ["xetla_kernel.so"]}) +}, package_data={"triton_kernels_benchmark": ["xetla_kernel.cpython-*.so"]}, cmdclass={ + "build_ext": build_ext, + "clean": clean, +}, ext_modules=[CMakeExtension("triton_kernels_benchmark")]) diff --git a/benchmarks/xetla_kernel/CMakeLists.txt b/benchmarks/xetla_kernel/CMakeLists.txt index beeb6dc432..439849f5c8 100644 --- a/benchmarks/xetla_kernel/CMakeLists.txt +++ b/benchmarks/xetla_kernel/CMakeLists.txt @@ -1,7 +1,3 @@ -# XeTLA library is required. 
-find_package(XeTLALibrary REQUIRED)
-set(CMAKE_CXX_STANDARD 20)
-
 set(XETLA_KERNEL_FLAGS ${XETLA_KERNEL_FLAGS}
   -fsycl
   -fsycl-device-code-split=per_kernel
@@ -29,8 +25,7 @@ else()
   set(XETLA_KERNEL_FLAGS ${XETLA_KERNEL_FLAGS} "${XETLA_OFFLINE_OPTIONS}")
 endif()
 
-add_library(xetla_kernel SHARED python_main.cpp)
-set_target_properties(xetla_kernel PROPERTIES PREFIX "")
+Python3_add_library(xetla_kernel MODULE WITH_SOABI python_main.cpp)
 target_compile_options(xetla_kernel PRIVATE "-fPIC")
 if(USE_IPEX)
   target_compile_options(xetla_kernel PRIVATE "-fsycl")
@@ -40,7 +35,6 @@ endif()
 target_compile_options(xetla_kernel PUBLIC "-DXETPP_NEW_XMAIN")
 target_link_options(xetla_kernel PRIVATE ${XETLA_KERNEL_FLAGS})
 target_link_libraries(xetla_kernel PUBLIC ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
-target_include_directories(xetla_kernel PUBLIC "${PYTHON_INCLUDE_DIRS}")
 target_include_directories(xetla_kernel PUBLIC "${XeTLALibrary_INCLUDE_DIR}")
 
 if(USE_IPEX)
@@ -52,3 +46,5 @@ add_subdirectory(softmax)
 add_subdirectory(gemm)
 add_subdirectory(stream_k_gemm)
 add_subdirectory(flash_attention)
+
+install(TARGETS xetla_kernel LIBRARY DESTINATION .)
diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_forward_v5.h b/benchmarks/xetla_kernel/flash_attention/fmha_forward_v5.h
index 7562d796dc..cd5a3de12b 100644
--- a/benchmarks/xetla_kernel/flash_attention/fmha_forward_v5.h
+++ b/benchmarks/xetla_kernel/flash_attention/fmha_forward_v5.h
@@ -16,6 +16,8 @@
 #ifndef TRITONBENCHMARK_FMHA_FWD_V5_H
 #define TRITONBENCHMARK_FMHA_FWD_V5_H
 
+#include
+
 #include "fmha_policy_v2.h"
 #include "fmha_utils.h"
 #include "xetla.hpp"
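
Example of exercising the new setup.py flow. The commands below are illustrative and not part of the patch; they assume the usual pip/setuptools workflow with ninja, icx/icpx and PyTorch already available in the build environment, and the exact invocation used by CI may differ. The USE_IPEX, DEBUG and MAX_JOBS environment variables are the ones read by the reworked setup.py above.

    cd benchmarks
    # Default build: IPEX is used if intel_extension_for_pytorch imports,
    # otherwise setup.py warns and falls back to the no-IPEX build.
    pip install --no-build-isolation -v .
    # Environment knobs picked up by the new setup.py.
    USE_IPEX=0 DEBUG=1 MAX_JOBS=8 pip install --no-build-isolation -v .
    # Remove build/temp without configuring or building anything.
    python setup.py clean

Note on the package_data change: Python3_add_library(... MODULE WITH_SOABI ...) appends the interpreter's SOABI tag to the module file name (for example xetla_kernel.cpython-311-x86_64-linux-gnu.so), which is why the plain "xetla_kernel.so" entry is replaced by the "xetla_kernel.cpython-*.so" glob.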