From 9d8ecbc19d3cef81ab877890b00e912a548ddd6b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:06:26 -0800 Subject: [PATCH 01/29] integrate ruff changes --- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_linker.py | 285 +++++++++++++++++++ cuda_core/docs/source/api.rst | 5 + cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.0-notes.md | 4 +- cuda_core/docs/source/release/0.2.0-notes.md | 11 + cuda_core/tests/test_linker.py | 101 +++++++ 7 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_linker.py create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md create mode 100644 cuda_core/tests/test_linker.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..12fed225 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -5,5 +5,6 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch +from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py new file mode 100644 index 00000000..e80bfe61 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -0,0 +1,285 @@ +from dataclasses import dataclass +from typing import List, Optional + +from cuda.bindings import nvjitlink +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils import check_or_create_options + + +@dataclass +class LinkerOptions: + """Customizable :obj:`LinkerOptions` for nvJitLink. + + Attributes + ---------- + arch : str + Pass SM architecture value. Can use compute_ value instead if only generating PTX. + This is a required option. + Acceptable value type: str + Maps to: -arch=sm_ + max_register_count : int, optional + Maximum register count. + Default: None + Acceptable value type: int + Maps to: -maxrregcount= + time : bool, optional + Print timing information to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -time + verbose : bool, optional + Print verbose messages to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -verbose + link_time_optimization : bool, optional + Perform link time optimization. + Default: False + Acceptable value type: bool + Maps to: -lto + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with -lto. + Default: False + Acceptable value type: bool + Maps to: -ptx + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. + Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + jump_table_density : int, optional + When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to + determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value + is 101. The percentage ranges from 0 to 101 inclusively. + Default: 101 + Acceptable value type: int + Maps to: -jump-table-density= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False + Acceptable value type: bool + Maps to: -no-cache + device_stack_protector : bool, optional + Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory + safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in + each function. Only those functions which are deemed high-risk make use of a stack canary. + Default: False + Acceptable value type: bool + Maps to: -device-stack-protector + """ + + arch: str + max_register_count: Optional[int] = None + time: Optional[bool] = None + verbose: Optional[bool] = None + link_time_optimization: Optional[bool] = None + ptx: Optional[bool] = None + optimization_level: Optional[int] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + ftz: Optional[bool] = None + prec_div: Optional[bool] = None + prec_sqrt: Optional[bool] = None + fma: Optional[bool] = None + kernels_used: Optional[List[str]] = None + variables_used: Optional[List[str]] = None + optimize_unused_variables: Optional[bool] = None + xptxas: Optional[List[str]] = None + split_compile: Optional[int] = None + split_compile_extended: Optional[int] = None + jump_table_density: Optional[int] = None + no_cache: Optional[bool] = None + device_stack_protector: Optional[bool] = None + + def __post_init__(self): + self.formatted_options = [] + if self.arch is not None: + self.formatted_options.append(f"-arch={self.arch}") + if self.max_register_count is not None: + self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + if self.time is not None: + self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append("-verbose") + if self.link_time_optimization is not None: + self.formatted_options.append("-lto") + if self.ptx is not None: + self.formatted_options.append("-ptx") + if self.optimization_level is not None: + self.formatted_options.append(f"-O{self.optimization_level}") + if self.debug is not None: + self.formatted_options.append("-g") + if self.lineinfo is not None: + self.formatted_options.append("-lineinfo") + if self.ftz is not None: + self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + if self.prec_div is not None: + self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + if self.prec_sqrt is not None: + self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + if self.fma is not None: + self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(f"-kernels-used={kernel}") + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(f"-variables-used={variable}") + if self.optimize_unused_variables is not None: + self.formatted_options.append("-optimize-unused-variables") + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(f"-Xptxas={opt}") + if self.split_compile is not None: + self.formatted_options.append(f"-split-compile={self.split_compile}") + if self.split_compile_extended is not None: + self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + if self.jump_table_density is not None: + self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") + if self.no_cache is not None: + self.formatted_options.append("-no-cache") + if self.device_stack_protector is not None: + self.formatted_options.append("-device-stack-protector") + + +class Linker: + __slots__ = "_handle" + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + self._handle = None + options = check_or_create_options(LinkerOptions, options, "Linker options") + self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + + if object_codes is not None: + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + nvjitlink.add_data( + self._handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + + def link(self, target_type) -> ObjectCode: + nvjitlink.complete(self._handle) + if target_type not in ["cubin", "ptx"]: + raise ValueError(f"Unsupported target type: {target_type}") + code = None + if target_type == "cubin": + cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + code = bytearray(cubin_size) + nvjitlink.get_linked_cubin(self._handle, code) + else: + ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + code = bytearray(ptx_size) + nvjitlink.get_linked_ptx(self._handle, code) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + log_size = nvjitlink.get_error_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_error_log(self._handle, log) + return log.decode() + + def get_info_log(self) -> str: + log_size = nvjitlink.get_info_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_info_log(self._handle, log) + return log.decode() + + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + # this list is based on the supported values for code_type in the ObjectCode class definition. + # nvjitlink supports other options for input type + if code_type == "ptx": + return nvjitlink.InputType.PTX + elif code_type == "cubin": + return nvjitlink.InputType.CUBIN + elif code_type == "fatbin": + return nvjitlink.InputType.FATBIN + elif code_type == "ltoir": + return nvjitlink.InputType.LTOIR + elif code_type == "object": + return nvjitlink.InputType.OBJECT + else: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + + @property + def handle(self) -> int: + return self._handle + + def __del__(self): + if self._handle is not None: + nvjitlink.destroy(self._handle) + self._handle = None diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..e10b36a8 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,8 @@ CUDA compilation toolchain :toctree: generated/ Program + Linker + + :template: dataclass.rst + + LinkerOptions \ No newline at end of file diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..4c615eb3 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,5 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 ``` diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2131ed90..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov 8, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..1a047511 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,11 @@ +# `cuda.core` Release notes + +Released on Nov , 2024 + +## Hightlights +- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink API. + +## Limitations + +-The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. + diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py new file mode 100644 index 00000000..6011bf4f --- /dev/null +++ b/cuda_core/tests/test_linker.py @@ -0,0 +1,101 @@ +import pytest + +from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._program import Program + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_entrypoint_kernel = "__global__ void A() {}" +empty_kernel = "__device__ void B() {}" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + + +@pytest.fixture(scope="module") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="module") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +@pytest.mark.parametrize( + "options", + [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, jump_table_density=100), + LinkerOptions(arch=ARCH, no_cache=True), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 1b5f01974d92e2fef030ecc9e1da701ae221cd30 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:09:37 -0800 Subject: [PATCH 02/29] fix commit --- cuda_core/cuda/core/experimental/_linker.py | 26 ++++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index e80bfe61..3a47b439 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,4 @@ +import weakref from dataclasses import dataclass from typing import List, Optional @@ -152,9 +153,7 @@ class LinkerOptions: xptxas: Optional[List[str]] = None split_compile: Optional[int] = None split_compile_extended: Optional[int] = None - jump_table_density: Optional[int] = None no_cache: Optional[bool] = None - device_stack_protector: Optional[bool] = None def __post_init__(self): self.formatted_options = [] @@ -199,26 +198,25 @@ def __post_init__(self): self.formatted_options.append(f"-split-compile={self.split_compile}") if self.split_compile_extended is not None: self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.jump_table_density is not None: - self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") if self.no_cache is not None: self.formatted_options.append("-no-cache") - if self.device_stack_protector is not None: - self.formatted_options.append("-device-stack-protector") class Linker: - __slots__ = "_handle" + __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._handle = None options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - if object_codes is not None: - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + weakref.finalize(self, self.close) def _add_code_object(self, object_code: ObjectCode): data = object_code._module @@ -233,7 +231,7 @@ def _add_code_object(self, object_code: ObjectCode): def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._handle) - if target_type not in ["cubin", "ptx"]: + if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": @@ -279,7 +277,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: def handle(self) -> int: return self._handle - def __del__(self): + def close(self): if self._handle is not None: nvjitlink.destroy(self._handle) self._handle = None From 58ce68f06841ebaae4bb6c4789c68fb8a16ec1e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:10:42 -0800 Subject: [PATCH 03/29] fix commit --- cuda_core/cuda/core/experimental/_linker.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 3a47b439..518c48d3 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -113,25 +113,11 @@ class LinkerOptions: Default: 1 Acceptable value type: int Maps to: -split-compile-extended= - jump_table_density : int, optional - When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to - determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value - is 101. The percentage ranges from 0 to 101 inclusively. - Default: 101 - Acceptable value type: int - Maps to: -jump-table-density= no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. Default: False Acceptable value type: bool Maps to: -no-cache - device_stack_protector : bool, optional - Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory - safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in - each function. Only those functions which are deemed high-risk make use of a stack canary. - Default: False - Acceptable value type: bool - Maps to: -device-stack-protector """ arch: str From ce8a47233786466d2e4d7335e518e0070dcf86ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:12:14 -0800 Subject: [PATCH 04/29] keep self._options for debugging --- cuda_core/cuda/core/experimental/_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 518c48d3..cf4c6ccd 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -192,7 +192,7 @@ class Linker: __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - options = check_or_create_options(LinkerOptions, options, "Linker options") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) if len(object_codes) == 0: From ab35b373ddda7b4177853d1c348a3b6027fb391f Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:13:41 -0800 Subject: [PATCH 05/29] revert release notes change --- cuda_core/docs/source/release/0.1.0-notes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 1ebb41f9..2131ed90 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov XX, 2024 +Released on Nov 8, 2024 ## Hightlights -- Initial EA1 (early access) release +- Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits From b82591fc70adb26023ddaf1ddc0fb2e5c4881b4c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:14:31 -0800 Subject: [PATCH 06/29] update linker test --- cuda_core/tests/test_linker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 6011bf4f..2dfac375 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,8 +1,7 @@ import pytest -from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental import Linker, LinkerOptions, Program from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._program import Program ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper empty_entrypoint_kernel = "__global__ void A() {}" From 265ba01c7ef586177afb877e0f2bbea42c80528d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:26:20 -0800 Subject: [PATCH 07/29] update the test --- cuda_core/tests/test_linker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 2dfac375..7db6ed9f 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -9,7 +9,7 @@ addition_kernel = "__device__ int C(int a, int b) { return a + b; }" -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") @@ -18,7 +18,7 @@ def compile_ptx_functions(init_cuda): return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) @@ -47,7 +47,6 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, xptxas=["-v"]), LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, jump_table_density=100), LinkerOptions(arch=ARCH, no_cache=True), ], ) From c8a8dcb0a682ab754e1d036c68dc312a0b97608d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:24:50 -0800 Subject: [PATCH 08/29] save --- cuda_core/cuda/core/experimental/_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index cf4c6ccd..d7dd273c 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import weakref from dataclasses import dataclass from typing import List, Optional From e9661895fff2a5d928be73b521aee30e4960935e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:33:31 -0800 Subject: [PATCH 09/29] add docstring, copyright header, and switch finalizer pattern --- cuda_core/cuda/core/experimental/_linker.py | 68 ++++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index d7dd273c..1a99f355 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -193,11 +193,49 @@ def __post_init__(self): class Linker: - __slots__ = ("__weakref__", "_handle", "_options") + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. + + Attributes + ---------- + _options : LinkerOptions + The options used for the linker. + _handle : handle + The handle to the linker created by nvjitlink. + + Methods + ------- + _add_code_object(object_code) + Adds an object code to the linker. + close() + Closes the linker and releases resources. + """ + + class _MembersNeededForFinalize: + __slots__ = ("handle",) + + def __init__(self, program_obj, handle): + self.handle = handle + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + nvjitlink.destroy(self.handle) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff = Linker._MembersNeededForFinalize(self, None) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -212,7 +250,7 @@ def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) nvjitlink.add_data( - self._handle, + self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), @@ -220,31 +258,31 @@ def _add_code_object(self, object_code: ObjectCode): ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._handle) + nvjitlink.complete(self._mnff.handle) if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._handle, code) + nvjitlink.get_linked_cubin(self._mnff.handle, code) else: - ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._handle, code) + nvjitlink.get_linked_ptx(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._handle) + log_size = nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_error_log(self._handle, log) + nvjitlink.get_error_log(self._mnff.handle, log) return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._handle) + log_size = nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_info_log(self._handle, log) + nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @@ -265,9 +303,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @property def handle(self) -> int: - return self._handle + return self._mnff.handle def close(self): - if self._handle is not None: - nvjitlink.destroy(self._handle) - self._handle = None + self._mnff.close() From 17c3e106ef82d8a5dcc8bae0c2c8ea484ccc2dda Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:15:46 -0800 Subject: [PATCH 10/29] address comments --- cuda_core/cuda/core/experimental/_linker.py | 52 +++++++++++---------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 1a99f355..bb66adde 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -234,8 +234,9 @@ def close(self): def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - self._mnff = Linker._MembersNeededForFinalize(self, None) + self._mnff = Linker._MembersNeededForFinalize( + self, nvjitlink.create(len(options.formatted_options), options.formatted_options) + ) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -244,8 +245,6 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): assert isinstance(code, ObjectCode) self._add_code_object(code) - weakref.finalize(self, self.close) - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -257,19 +256,21 @@ def _add_code_object(self, object_code: ObjectCode): f"{object_code._handle}_{object_code._code_type}", ) + _get_linked_methods = { + "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), + "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), + } + def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._mnff.handle) - if target_type not in ("cubin", "ptx"): + get_linked = self._get_linked_methods.get(target_type) + if get_linked is None: raise ValueError(f"Unsupported target type: {target_type}") - code = None - if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) - code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._mnff.handle, code) - else: - ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) - code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._mnff.handle, code) + + get_size, get_code = get_linked + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) @@ -285,21 +286,22 @@ def get_info_log(self) -> str: nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() + _input_types = { + "ptx": nvjitlink.InputType.PTX, + "cubin": nvjitlink.InputType.CUBIN, + "fatbin": nvjitlink.InputType.FATBIN, + "ltoir": nvjitlink.InputType.LTOIR, + "object": nvjitlink.InputType.OBJECT, + } + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: # this list is based on the supported values for code_type in the ObjectCode class definition. # nvjitlink supports other options for input type - if code_type == "ptx": - return nvjitlink.InputType.PTX - elif code_type == "cubin": - return nvjitlink.InputType.CUBIN - elif code_type == "fatbin": - return nvjitlink.InputType.FATBIN - elif code_type == "ltoir": - return nvjitlink.InputType.LTOIR - elif code_type == "object": - return nvjitlink.InputType.OBJECT - else: + input_type = self._input_types.get(code_type) + + if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type @property def handle(self) -> int: From 7f846263d9feffe601948eb0b82b3668b6855713 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:18:44 -0800 Subject: [PATCH 11/29] rename release notes --- cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} | 1 - 1 file changed, 1 deletion(-) rename cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} (93%) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md similarity index 93% rename from cuda_core/docs/source/release/0.2.0-notes.md rename to cuda_core/docs/source/release/0.1.1-notes.md index 1a047511..0dbd49ce 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -8,4 +8,3 @@ Released on Nov , 2024 ## Limitations -The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. - From 5207558076d366abf483e72daedc7fd6dce378e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:42:12 -0800 Subject: [PATCH 12/29] rename release notes --- cuda_core/docs/source/release.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 4c615eb3..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,6 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 - 0.2.0 ``` From 14b9c6766160bcb23227bc303117d9137f8569e0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 10:53:20 -0800 Subject: [PATCH 13/29] fix the test to not use a global function, which was causing swallowed link errors --- cuda_core/tests/test_linker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..1cb444fb 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,15 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_entrypoint_kernel = "__global__ void A() {}" -empty_kernel = "__device__ void B() {}" +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @@ -20,8 +20,8 @@ def compile_ptx_functions(init_cuda): @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From a7f8c309ad84245b26333062c473baf5326ae191 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 4 Dec 2024 01:14:28 +0000 Subject: [PATCH 14/29] WIP: enable cuLink APIs from driver --- cuda_core/cuda/core/experimental/_linker.py | 253 +++++++++++++++----- cuda_core/tests/test_linker.py | 32 +-- 2 files changed, 209 insertions(+), 76 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index bb66adde..57a10866 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -2,13 +2,64 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import ctypes import weakref from dataclasses import dataclass from typing import List, Optional -from cuda.bindings import nvjitlink +from cuda import cuda from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import check_or_create_options +from cuda.core.experimental._utils import check_or_create_options, handle_return + +# TODO: revisit this treatment for py313t builds +_driver = None # populated if nvJitLink cannot be used +_driver_input_types = None # populated if nvJitLink cannot be used +_driver_ver = None +_inited = False +_nvjitlink = None # populated if nvJitLink can be used +_nvjitlink_input_types = None # populated if nvJitLink cannot be used + + +def _lazy_init(): + global _inited + if _inited: + return + + global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types + _driver_ver = handle_return(cuda.cuDriverGetVersion()) + _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) + try: + from cuda.bindings import nvjitlink + from cuda.bindings._internal import nvjitlink as inner_nvjitlink + except ImportError: + # binding is not available + nvjitlink = None + else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + nvjitlink = None + elif _driver_ver > nvjitlink.version(): + # TODO: nvJitLink is not new enough, warn? + pass + if nvjitlink: + _nvjitlink = nvjitlink + _nvjitlink_input_types = { + "ptx": _nvjitlink.InputType.PTX, + "cubin": _nvjitlink.InputType.CUBIN, + "fatbin": _nvjitlink.InputType.FATBIN, + "ltoir": _nvjitlink.InputType.LTOIR, + "object": _nvjitlink.InputType.OBJECT, + } + else: + from cuda import cuda as _driver + + _driver_input_types = { + "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, + "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, + "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, + } + _inited = True @dataclass @@ -146,7 +197,14 @@ class LinkerOptions: no_cache: Optional[bool] = None def __post_init__(self): + _lazy_init() self.formatted_options = [] + if _nvjitlink: + self._init_nvjitlink() + else: + self._init_driver() + + def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") if self.max_register_count is not None: @@ -191,6 +249,67 @@ def __post_init__(self): if self.no_cache is not None: self.formatted_options.append("-no-cache") + def _init_driver(self): + self.option_keys = [] + # allocate 4 KiB each for info/error logs + size = 4194304 + self.formatted_options.extend((bytearray(size), size, bytearray(size), size)) + self.option_keys.extend( + ( + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + ) + ) + + if self.arch is not None: + arch = self.arch.split("_")[-1].upper() + self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + # if self.max_register_count is not None: + # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + # if self.time is not None: + # self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append(1) # ctypes.c_int32(1)) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + # if self.link_time_optimization is not None: + # self.formatted_options.append("-lto") + # if self.ptx is not None: + # self.formatted_options.append("-ptx") + # if self.optimization_level is not None: + # self.formatted_options.append(f"-O{self.optimization_level}") + # if self.debug is not None: + # self.formatted_options.append("-g") + # if self.lineinfo is not None: + # self.formatted_options.append("-lineinfo") + # if self.ftz is not None: + # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + # if self.prec_div is not None: + # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + # if self.prec_sqrt is not None: + # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + # if self.fma is not None: + # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + # if self.kernels_used is not None: + # for kernel in self.kernels_used: + # self.formatted_options.append(f"-kernels-used={kernel}") + # if self.variables_used is not None: + # for variable in self.variables_used: + # self.formatted_options.append(f"-variables-used={variable}") + # if self.optimize_unused_variables is not None: + # self.formatted_options.append("-optimize-unused-variables") + # if self.xptxas is not None: + # for opt in self.xptxas: + # self.formatted_options.append(f"-Xptxas={opt}") + # if self.split_compile is not None: + # self.formatted_options.append(f"-split-compile={self.split_compile}") + # if self.split_compile_extended is not None: + # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + # if self.no_cache is not None: + # self.formatted_options.append("-no-cache") + class Linker: """ @@ -202,45 +321,41 @@ class Linker: One or more ObjectCode objects to be linked. options : LinkerOptions, optional Options for the linker. If not provided, default options will be used. - - Attributes - ---------- - _options : LinkerOptions - The options used for the linker. - _handle : handle - The handle to the linker created by nvjitlink. - - Methods - ------- - _add_code_object(object_code) - Adds an object code to the linker. - close() - Closes the linker and releases resources. """ class _MembersNeededForFinalize: - __slots__ = ("handle",) + __slots__ = ("handle", "use_nvjitlink") - def __init__(self, program_obj, handle): + def __init__(self, program_obj, handle, use_nvjitlink): self.handle = handle + self.use_nvjitlink = use_nvjitlink weakref.finalize(program_obj, self.close) def close(self): if self.handle is not None: - nvjitlink.destroy(self.handle) + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) self.handle = None __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._mnff = Linker._MembersNeededForFinalize( - self, nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) @@ -248,56 +363,74 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - - _get_linked_methods = { - "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), - "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), - } + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._mnff.handle) - get_linked = self._get_linked_methods.get(target_type) - if get_linked is None: + if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx - get_size, get_code = get_linked - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_error_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_info_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] return log.decode() - _input_types = { - "ptx": nvjitlink.InputType.PTX, - "cubin": nvjitlink.InputType.CUBIN, - "fatbin": nvjitlink.InputType.FATBIN, - "ltoir": nvjitlink.InputType.LTOIR, - "object": nvjitlink.InputType.OBJECT, - } - - def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. - # nvjitlink supports other options for input type - input_type = self._input_types.get(code_type) + # nvJitLink/driver support other options for input type + if _nvjitlink: + input_type = _nvjitlink_input_types.get(code_type) + else: + input_type = _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..4d10f423 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,23 +31,23 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, time=True), + # LinkerOptions(arch=ARCH, max_register_count=32), + # LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, no_cache=True), + # LinkerOptions(arch=ARCH, optimization_level=3), + # LinkerOptions(arch=ARCH, debug=True), + # LinkerOptions(arch=ARCH, lineinfo=True), + # LinkerOptions(arch=ARCH, ftz=True), + # LinkerOptions(arch=ARCH, prec_div=True), + # LinkerOptions(arch=ARCH, prec_sqrt=True), + # LinkerOptions(arch=ARCH, fma=True), + # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + # LinkerOptions(arch=ARCH, variables_used=["var1"]), + # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + # LinkerOptions(arch=ARCH, xptxas=["-v"]), + # LinkerOptions(arch=ARCH, split_compile=0), + # LinkerOptions(arch=ARCH, split_compile_extended=1), + # LinkerOptions(arch=ARCH, no_cache=True), ], ) def test_linker_init(compile_ptx_functions, options): From 028a5c234b4a40e6298ea0e0a4d950013e20ebf5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:23:31 -0800 Subject: [PATCH 15/29] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 104 +++++++++++--------- cuda_core/tests/test_linker.py | 32 +++--- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 57a10866..304b3771 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -29,6 +29,7 @@ def _lazy_init(): _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: + raise ImportError from cuda.bindings import nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: @@ -267,48 +268,66 @@ def _init_driver(self): arch = self.arch.split("_")[-1].upper() self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) - # if self.max_register_count is not None: - # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - # if self.time is not None: - # self.formatted_options.append("-time") + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME) if self.verbose is not None: - self.formatted_options.append(1) # ctypes.c_int32(1)) + self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) - # if self.link_time_optimization is not None: - # self.formatted_options.append("-lto") - # if self.ptx is not None: - # self.formatted_options.append("-ptx") - # if self.optimization_level is not None: - # self.formatted_options.append(f"-O{self.optimization_level}") - # if self.debug is not None: - # self.formatted_options.append("-g") - # if self.lineinfo is not None: - # self.formatted_options.append("-lineinfo") - # if self.ftz is not None: - # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - # if self.prec_div is not None: - # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - # if self.prec_sqrt is not None: - # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - # if self.fma is not None: - # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - # if self.kernels_used is not None: - # for kernel in self.kernels_used: - # self.formatted_options.append(f"-kernels-used={kernel}") - # if self.variables_used is not None: - # for variable in self.variables_used: - # self.formatted_options.append(f"-variables-used={variable}") - # if self.optimize_unused_variables is not None: - # self.formatted_options.append("-optimize-unused-variables") - # if self.xptxas is not None: - # for opt in self.xptxas: - # self.formatted_options.append(f"-Xptxas={opt}") - # if self.split_compile is not None: - # self.formatted_options.append(f"-split-compile={self.split_compile}") - # if self.split_compile_extended is not None: - # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - # if self.no_cache is not None: - # self.formatted_options.append("-no-cache") + if self.link_time_optimization is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + self.formatted_options.append(1 if self.ftz else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ) + if self.prec_div is not None: + self.formatted_options.append(1 if self.prec_div else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV) + if self.prec_sqrt is not None: + self.formatted_options.append(1 if self.prec_sqrt else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT) + if self.fma is not None: + self.formatted_options.append(1 if self.fma else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(kernel) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(variable) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + if self.optimize_unused_variables is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(opt) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) + if self.split_compile is not None: + self.formatted_options.append(self.split_compile) + self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + if self.split_compile_extended is not None: + self.formatted_options.append(self.split_compile_extended) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + if self.no_cache is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) class Linker: @@ -427,10 +446,7 @@ def get_info_log(self) -> str: def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. # nvJitLink/driver support other options for input type - if _nvjitlink: - input_type = _nvjitlink_input_types.get(code_type) - else: - input_type = _driver_input_types.get(code_type) + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index ac7a5012..1851c7ba 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,22 +31,22 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - # LinkerOptions(arch=ARCH, max_register_count=32), - # LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - # LinkerOptions(arch=ARCH, optimization_level=3), - # LinkerOptions(arch=ARCH, debug=True), - # LinkerOptions(arch=ARCH, lineinfo=True), - # LinkerOptions(arch=ARCH, ftz=True), - # LinkerOptions(arch=ARCH, prec_div=True), - # LinkerOptions(arch=ARCH, prec_sqrt=True), - # LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), # LinkerOptions(arch=ARCH, variables_used=["var1"]), - # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), - # LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, split_compile_extended=1), # LinkerOptions(arch=ARCH, no_cache=True), ], ) @@ -62,11 +62,11 @@ def test_linker_init_invalid_arch(): Linker(options) -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) +# def test_linker_link_ptx(compile_ltoir_functions): +# options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) +# linker = Linker(*compile_ltoir_functions, options=options) +# linked_code = linker.link("ptx") +# assert isinstance(linked_code, ObjectCode) def test_linker_link_cubin(compile_ptx_functions): From d7bf4cb304404d6b001fa0e5df479a6d1f9fd514 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:28:38 -0800 Subject: [PATCH 16/29] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 10 +++------- cuda_core/tests/test_linker.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 304b3771..79328583 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -306,22 +306,18 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) if self.kernels_used is not None: for kernel in self.kernels_used: - self.formatted_options.append(kernel) + self.formatted_options.append(kernel.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) if self.variables_used is not None: for variable in self.variables_used: - self.formatted_options.append(variable) + self.formatted_options.append(variable.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) if self.optimize_unused_variables is not None: self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) if self.xptxas is not None: for opt in self.xptxas: - self.formatted_options.append(opt) - self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) - if self.split_compile is not None: - self.formatted_options.append(self.split_compile) - self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + raise NotImplementedError("TODO: implement xptxas option") if self.split_compile_extended is not None: self.formatted_options.append(self.split_compile_extended) self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1851c7ba..3937c878 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -41,8 +41,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - # LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), From 702fbaa550f1b40f14fa35a656bcfc5817b96ff9 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 11:54:23 -0800 Subject: [PATCH 17/29] handle culink and nvjitlink differences in the backend and test --- cuda_core/cuda/core/experimental/_linker.py | 50 ++++++++------------ cuda_core/tests/test_linker.py | 51 +++++++++++++++------ 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 79328583..39d6cd27 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -29,7 +29,6 @@ def _lazy_init(): _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: - raise ImportError from cuda.bindings import nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: @@ -247,7 +246,7 @@ def _init_nvjitlink(self): self.formatted_options.append(f"-split-compile={self.split_compile}") if self.split_compile_extended is not None: self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.no_cache is not None: + if self.no_cache is True: self.formatted_options.append("-no-cache") def _init_driver(self): @@ -272,57 +271,46 @@ def _init_driver(self): self.formatted_options.append(self.max_register_count) self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) if self.time is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME) + raise ValueError("time option is not supported by the driver API") if self.verbose is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) if self.link_time_optimization is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) if self.ptx is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + raise ValueError("ptx option is not supported by the driver API") if self.optimization_level is not None: self.formatted_options.append(self.optimization_level) self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) if self.debug is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) if self.lineinfo is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) if self.ftz is not None: - self.formatted_options.append(1 if self.ftz else 0) - self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ) + raise ValueError("ftz option is deprecated in the driver API") if self.prec_div is not None: - self.formatted_options.append(1 if self.prec_div else 0) - self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV) + raise ValueError("prec_div option is deprecated in the driver API") if self.prec_sqrt is not None: - self.formatted_options.append(1 if self.prec_sqrt else 0) - self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT) + raise ValueError("prec_sqrt option is deprecated in the driver API") if self.fma is not None: - self.formatted_options.append(1 if self.fma else 0) - self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) + raise ValueError("fma options is deprecated in the driver API") if self.kernels_used is not None: - for kernel in self.kernels_used: - self.formatted_options.append(kernel.encode()) - self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) + raise ValueError("kernels_used is deprecated in the driver API") if self.variables_used is not None: - for variable in self.variables_used: - self.formatted_options.append(variable.encode()) - self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + raise ValueError("variables_used is deprecated in the driver API") if self.optimize_unused_variables is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + raise ValueError("optimize_unused_variables is deprecated in the driver API") if self.xptxas is not None: - for opt in self.xptxas: - raise NotImplementedError("TODO: implement xptxas option") + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: - self.formatted_options.append(self.split_compile_extended) - self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + raise ValueError("split_compile_extended option is not supported by the driver API") if self.no_cache is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 3937c878..db9ff657 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -8,6 +8,17 @@ basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): @@ -27,27 +38,36 @@ def compile_ltoir_functions(init_cuda): return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + @pytest.mark.parametrize( "options", - [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), + culink_options + if culink_backend + else culink_options + + [ LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), LinkerOptions(arch=ARCH, ftz=True), LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), - # LinkerOptions(arch=ARCH, xptxas=["-v"]), - # LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, split_compile_extended=1), - # LinkerOptions(arch=ARCH, no_cache=True), ], ) def test_linker_init(compile_ptx_functions, options): @@ -62,11 +82,12 @@ def test_linker_init_invalid_arch(): Linker(options) -# def test_linker_link_ptx(compile_ltoir_functions): -# options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) -# linker = Linker(*compile_ltoir_functions, options=options) -# linked_code = linker.link("ptx") -# assert isinstance(linked_code, ObjectCode) +@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) def test_linker_link_cubin(compile_ptx_functions): From 996ab39a58d1e9495e9ba946527164879fc648f8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 13:45:05 -0800 Subject: [PATCH 18/29] update line endings --- cuda_core/cuda/core/experimental/_linker.py | 888 ++++++++++---------- cuda_core/tests/test_linker.py | 240 +++--- 2 files changed, 564 insertions(+), 564 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 39d6cd27..7d95d371 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,444 +1,444 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import ctypes -import weakref -from dataclasses import dataclass -from typing import List, Optional - -from cuda import cuda -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import check_or_create_options, handle_return - -# TODO: revisit this treatment for py313t builds -_driver = None # populated if nvJitLink cannot be used -_driver_input_types = None # populated if nvJitLink cannot be used -_driver_ver = None -_inited = False -_nvjitlink = None # populated if nvJitLink can be used -_nvjitlink_input_types = None # populated if nvJitLink cannot be used - - -def _lazy_init(): - global _inited - if _inited: - return - - global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types - _driver_ver = handle_return(cuda.cuDriverGetVersion()) - _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) - try: - from cuda.bindings import nvjitlink - from cuda.bindings._internal import nvjitlink as inner_nvjitlink - except ImportError: - # binding is not available - nvjitlink = None - else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - nvjitlink = None - elif _driver_ver > nvjitlink.version(): - # TODO: nvJitLink is not new enough, warn? - pass - if nvjitlink: - _nvjitlink = nvjitlink - _nvjitlink_input_types = { - "ptx": _nvjitlink.InputType.PTX, - "cubin": _nvjitlink.InputType.CUBIN, - "fatbin": _nvjitlink.InputType.FATBIN, - "ltoir": _nvjitlink.InputType.LTOIR, - "object": _nvjitlink.InputType.OBJECT, - } - else: - from cuda import cuda as _driver - - _driver_input_types = { - "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, - "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, - "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, - "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, - } - _inited = True - - -@dataclass -class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink. - - Attributes - ---------- - arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. - This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ - max_register_count : int, optional - Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= - time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time - verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose - link_time_optimization : bool, optional - Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto - ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx - optimization_level : int, optional - Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O - debug : bool, optional - Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g - lineinfo : bool, optional - Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo - ftz : bool, optional - Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= - prec_div : bool, optional - Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= - prec_sqrt : bool, optional - Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= - fma : bool, optional - Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= - kernels_used : List[str], optional - Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= - variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= - optimize_unused_variables : bool, optional - Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables - xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= - split_compile : int, optional - Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split - compilation (default). - Default: 1 - Acceptable value type: int - Maps to: -split-compile= - split_compile_extended : int, optional - A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. - Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This - option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= - no_cache : bool, optional - Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache - """ - - arch: str - max_register_count: Optional[int] = None - time: Optional[bool] = None - verbose: Optional[bool] = None - link_time_optimization: Optional[bool] = None - ptx: Optional[bool] = None - optimization_level: Optional[int] = None - debug: Optional[bool] = None - lineinfo: Optional[bool] = None - ftz: Optional[bool] = None - prec_div: Optional[bool] = None - prec_sqrt: Optional[bool] = None - fma: Optional[bool] = None - kernels_used: Optional[List[str]] = None - variables_used: Optional[List[str]] = None - optimize_unused_variables: Optional[bool] = None - xptxas: Optional[List[str]] = None - split_compile: Optional[int] = None - split_compile_extended: Optional[int] = None - no_cache: Optional[bool] = None - - def __post_init__(self): - _lazy_init() - self.formatted_options = [] - if _nvjitlink: - self._init_nvjitlink() - else: - self._init_driver() - - def _init_nvjitlink(self): - if self.arch is not None: - self.formatted_options.append(f"-arch={self.arch}") - if self.max_register_count is not None: - self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - if self.time is not None: - self.formatted_options.append("-time") - if self.verbose is not None: - self.formatted_options.append("-verbose") - if self.link_time_optimization is not None: - self.formatted_options.append("-lto") - if self.ptx is not None: - self.formatted_options.append("-ptx") - if self.optimization_level is not None: - self.formatted_options.append(f"-O{self.optimization_level}") - if self.debug is not None: - self.formatted_options.append("-g") - if self.lineinfo is not None: - self.formatted_options.append("-lineinfo") - if self.ftz is not None: - self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - if self.prec_div is not None: - self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - if self.prec_sqrt is not None: - self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - if self.fma is not None: - self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - if self.kernels_used is not None: - for kernel in self.kernels_used: - self.formatted_options.append(f"-kernels-used={kernel}") - if self.variables_used is not None: - for variable in self.variables_used: - self.formatted_options.append(f"-variables-used={variable}") - if self.optimize_unused_variables is not None: - self.formatted_options.append("-optimize-unused-variables") - if self.xptxas is not None: - for opt in self.xptxas: - self.formatted_options.append(f"-Xptxas={opt}") - if self.split_compile is not None: - self.formatted_options.append(f"-split-compile={self.split_compile}") - if self.split_compile_extended is not None: - self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.no_cache is True: - self.formatted_options.append("-no-cache") - - def _init_driver(self): - self.option_keys = [] - # allocate 4 KiB each for info/error logs - size = 4194304 - self.formatted_options.extend((bytearray(size), size, bytearray(size), size)) - self.option_keys.extend( - ( - _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER, - _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, - _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - ) - ) - - if self.arch is not None: - arch = self.arch.split("_")[-1].upper() - self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) - self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) - if self.max_register_count is not None: - self.formatted_options.append(self.max_register_count) - self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) - if self.time is not None: - raise ValueError("time option is not supported by the driver API") - if self.verbose is not None: - self.formatted_options.append(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) - if self.link_time_optimization is not None: - self.formatted_options.append(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) - if self.ptx is not None: - raise ValueError("ptx option is not supported by the driver API") - if self.optimization_level is not None: - self.formatted_options.append(self.optimization_level) - self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) - if self.debug is not None: - self.formatted_options.append(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) - if self.lineinfo is not None: - self.formatted_options.append(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) - if self.ftz is not None: - raise ValueError("ftz option is deprecated in the driver API") - if self.prec_div is not None: - raise ValueError("prec_div option is deprecated in the driver API") - if self.prec_sqrt is not None: - raise ValueError("prec_sqrt option is deprecated in the driver API") - if self.fma is not None: - raise ValueError("fma options is deprecated in the driver API") - if self.kernels_used is not None: - raise ValueError("kernels_used is deprecated in the driver API") - if self.variables_used is not None: - raise ValueError("variables_used is deprecated in the driver API") - if self.optimize_unused_variables is not None: - raise ValueError("optimize_unused_variables is deprecated in the driver API") - if self.xptxas is not None: - raise ValueError("xptxas option is not supported by the driver API") - if self.split_compile is not None: - raise ValueError("split_compile option is not supported by the driver API") - if self.split_compile_extended is not None: - raise ValueError("split_compile_extended option is not supported by the driver API") - if self.no_cache is not None: - self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) - self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) - - -class Linker: - """ - Linker class for managing the linking of object codes with specified options. - - Parameters - ---------- - object_codes : ObjectCode - One or more ObjectCode objects to be linked. - options : LinkerOptions, optional - Options for the linker. If not provided, default options will be used. - """ - - class _MembersNeededForFinalize: - __slots__ = ("handle", "use_nvjitlink") - - def __init__(self, program_obj, handle, use_nvjitlink): - self.handle = handle - self.use_nvjitlink = use_nvjitlink - weakref.finalize(program_obj, self.close) - - def close(self): - if self.handle is not None: - if self.use_nvjitlink: - _nvjitlink.destroy(self.handle) - else: - handle_return(_driver.cuLinkDestroy(self.handle)) - self.handle = None - - __slots__ = ("__weakref__", "_mnff", "_options") - - def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - if len(object_codes) == 0: - raise ValueError("At least one ObjectCode object must be provided") - - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) - use_nvjitlink = True - else: - handle = handle_return( - _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) - ) - use_nvjitlink = False - self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) - - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) - - def _add_code_object(self, object_code: ObjectCode): - data = object_code._module - assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, - ) - ) - - def link(self, target_type) -> ObjectCode: - if target_type not in ("cubin", "ptx"): - raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - _nvjitlink.complete(self._mnff.handle) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin - else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) - else: - addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) - code = (ctypes.c_char * size).from_address(addr) - - return ObjectCode(bytes(code), target_type) - - def get_error_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_error_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[2] - return log.decode() - - def get_info_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_info_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[0] - return log.decode() - - def _input_type_from_code_type(self, code_type: str): - # this list is based on the supported values for code_type in the ObjectCode class definition. - # nvJitLink/driver support other options for input type - input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) - - if input_type is None: - raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") - return input_type - - @property - def handle(self) -> int: - return self._mnff.handle - - def close(self): - self._mnff.close() +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import ctypes +import weakref +from dataclasses import dataclass +from typing import List, Optional + +from cuda import cuda +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils import check_or_create_options, handle_return + +# TODO: revisit this treatment for py313t builds +_driver = None # populated if nvJitLink cannot be used +_driver_input_types = None # populated if nvJitLink cannot be used +_driver_ver = None +_inited = False +_nvjitlink = None # populated if nvJitLink can be used +_nvjitlink_input_types = None # populated if nvJitLink cannot be used + + +def _lazy_init(): + global _inited + if _inited: + return + + global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types + _driver_ver = handle_return(cuda.cuDriverGetVersion()) + _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) + try: + from cuda.bindings import nvjitlink + from cuda.bindings._internal import nvjitlink as inner_nvjitlink + except ImportError: + # binding is not available + nvjitlink = None + else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + nvjitlink = None + elif _driver_ver > nvjitlink.version(): + # TODO: nvJitLink is not new enough, warn? + pass + if nvjitlink: + _nvjitlink = nvjitlink + _nvjitlink_input_types = { + "ptx": _nvjitlink.InputType.PTX, + "cubin": _nvjitlink.InputType.CUBIN, + "fatbin": _nvjitlink.InputType.FATBIN, + "ltoir": _nvjitlink.InputType.LTOIR, + "object": _nvjitlink.InputType.OBJECT, + } + else: + from cuda import cuda as _driver + + _driver_input_types = { + "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, + "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, + "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, + } + _inited = True + + +@dataclass +class LinkerOptions: + """Customizable :obj:`LinkerOptions` for nvJitLink. + + Attributes + ---------- + arch : str + Pass SM architecture value. Can use compute_ value instead if only generating PTX. + This is a required option. + Acceptable value type: str + Maps to: -arch=sm_ + max_register_count : int, optional + Maximum register count. + Default: None + Acceptable value type: int + Maps to: -maxrregcount= + time : bool, optional + Print timing information to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -time + verbose : bool, optional + Print verbose messages to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -verbose + link_time_optimization : bool, optional + Perform link time optimization. + Default: False + Acceptable value type: bool + Maps to: -lto + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with -lto. + Default: False + Acceptable value type: bool + Maps to: -ptx + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. + Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False + Acceptable value type: bool + Maps to: -no-cache + """ + + arch: str + max_register_count: Optional[int] = None + time: Optional[bool] = None + verbose: Optional[bool] = None + link_time_optimization: Optional[bool] = None + ptx: Optional[bool] = None + optimization_level: Optional[int] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + ftz: Optional[bool] = None + prec_div: Optional[bool] = None + prec_sqrt: Optional[bool] = None + fma: Optional[bool] = None + kernels_used: Optional[List[str]] = None + variables_used: Optional[List[str]] = None + optimize_unused_variables: Optional[bool] = None + xptxas: Optional[List[str]] = None + split_compile: Optional[int] = None + split_compile_extended: Optional[int] = None + no_cache: Optional[bool] = None + + def __post_init__(self): + _lazy_init() + self.formatted_options = [] + if _nvjitlink: + self._init_nvjitlink() + else: + self._init_driver() + + def _init_nvjitlink(self): + if self.arch is not None: + self.formatted_options.append(f"-arch={self.arch}") + if self.max_register_count is not None: + self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + if self.time is not None: + self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append("-verbose") + if self.link_time_optimization is not None: + self.formatted_options.append("-lto") + if self.ptx is not None: + self.formatted_options.append("-ptx") + if self.optimization_level is not None: + self.formatted_options.append(f"-O{self.optimization_level}") + if self.debug is not None: + self.formatted_options.append("-g") + if self.lineinfo is not None: + self.formatted_options.append("-lineinfo") + if self.ftz is not None: + self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + if self.prec_div is not None: + self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + if self.prec_sqrt is not None: + self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + if self.fma is not None: + self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(f"-kernels-used={kernel}") + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(f"-variables-used={variable}") + if self.optimize_unused_variables is not None: + self.formatted_options.append("-optimize-unused-variables") + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(f"-Xptxas={opt}") + if self.split_compile is not None: + self.formatted_options.append(f"-split-compile={self.split_compile}") + if self.split_compile_extended is not None: + self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + if self.no_cache is True: + self.formatted_options.append("-no-cache") + + def _init_driver(self): + self.option_keys = [] + # allocate 4 KiB each for info/error logs + size = 4194304 + self.formatted_options.extend((bytearray(size), size, bytearray(size), size)) + self.option_keys.extend( + ( + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + ) + ) + + if self.arch is not None: + arch = self.arch.split("_")[-1].upper() + self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + raise ValueError("time option is not supported by the driver API") + if self.verbose is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + if self.link_time_optimization is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + raise ValueError("ptx option is not supported by the driver API") + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + raise ValueError("ftz option is deprecated in the driver API") + if self.prec_div is not None: + raise ValueError("prec_div option is deprecated in the driver API") + if self.prec_sqrt is not None: + raise ValueError("prec_sqrt option is deprecated in the driver API") + if self.fma is not None: + raise ValueError("fma options is deprecated in the driver API") + if self.kernels_used is not None: + raise ValueError("kernels_used is deprecated in the driver API") + if self.variables_used is not None: + raise ValueError("variables_used is deprecated in the driver API") + if self.optimize_unused_variables is not None: + raise ValueError("optimize_unused_variables is deprecated in the driver API") + if self.xptxas is not None: + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") + if self.split_compile_extended is not None: + raise ValueError("split_compile_extended option is not supported by the driver API") + if self.no_cache is not None: + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) + + +class Linker: + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. + """ + + class _MembersNeededForFinalize: + __slots__ = ("handle", "use_nvjitlink") + + def __init__(self, program_obj, handle, use_nvjitlink): + self.handle = handle + self.use_nvjitlink = use_nvjitlink + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) + + def link(self, target_type) -> ObjectCode: + if target_type not in ("cubin", "ptx"): + raise ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] + return log.decode() + + def get_info_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] + return log.decode() + + def _input_type_from_code_type(self, code_type: str): + # this list is based on the supported values for code_type in the ObjectCode class definition. + # nvJitLink/driver support other options for input type + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) + + if input_type is None: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type + + @property + def handle(self) -> int: + return self._mnff.handle + + def close(self): + self._mnff.close() diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index db9ff657..15496b59 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,120 +1,120 @@ -import pytest - -from cuda.core.experimental import Linker, LinkerOptions, Program -from cuda.core.experimental._module import ObjectCode - -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True - - -@pytest.fixture(scope="function") -def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") - - return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx - - -@pytest.fixture(scope="function") -def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) - - return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir - - -culink_options = [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), -] - - -@pytest.mark.parametrize( - "options", - culink_options - if culink_backend - else culink_options - + [ - LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - ], -) -def test_linker_init(compile_ptx_functions, options): - linker = Linker(*compile_ptx_functions, options=options) - object_code = linker.link("cubin") - assert isinstance(object_code, ObjectCode) - - -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) - - -@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_cubin(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linked_code = linker.link("cubin") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_invalid_target_type(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - with pytest.raises(ValueError): - linker.link("invalid_target") - - -def test_linker_get_error_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_error_log() - assert isinstance(log, str) - - -def test_linker_get_info_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_info_log() - assert isinstance(log, str) +import pytest + +from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental._module import ObjectCode + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + + +@pytest.fixture(scope="function") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="function") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + +@pytest.mark.parametrize( + "options", + culink_options + if culink_backend + else culink_options + + [ + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 8ed625615a6bb6cd700b818620ab923c9a72ce38 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 14:59:17 -0800 Subject: [PATCH 19/29] update the test --- cuda_core/cuda/core/experimental/_linker.py | 2 +- cuda_core/docs/source/api.rst | 3 -- cuda_core/tests/test_linker.py | 39 ++++++++++++++------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 7d95d371..bf232cad 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -309,7 +309,7 @@ def _init_driver(self): raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: raise ValueError("split_compile_extended option is not supported by the driver API") - if self.no_cache is not None: + if self.no_cache is True: self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index a6a34e40..c3e66b52 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,13 +31,11 @@ CUDA compilation toolchain :toctree: generated/ Program -<<<<<<< HEAD Linker :template: dataclass.rst LinkerOptions -======= .. module:: cuda.core.experimental.utils @@ -53,4 +51,3 @@ Utility functions :template: dataclass.rst StridedMemoryView ->>>>>>> origin/main diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 15496b59..f5dc33dd 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - +device_function_a = """ +__device__ int B(); +__device__ int C(int a, int b); +__device__ void A() { int result = C(B(), 1);} +""" +device_function_b = "__device__ int B() { return 0; }" +device_function_c = "__device__ int C(int a, int b) { return a + b; }" + +culink_backend = False try: from cuda.bindings import nvjitlink # noqa F401 from cuda.bindings._internal import nvjitlink as inner_nvjitlink @@ -22,18 +27,18 @@ @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++").compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++").compile("ptx") + object_code_a_ptx = Program(device_function_a, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir @@ -60,8 +65,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, kernels_used=["A"]), + LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), @@ -83,13 +88,21 @@ def test_linker_init_invalid_arch(): @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): +def test_linker_link_ptx_nvjitlink(compile_ltoir_functions): options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) linker = Linker(*compile_ltoir_functions, options=options) linked_code = linker.link("ptx") assert isinstance(linked_code, ObjectCode) +@pytest.mark.skipif(not culink_backend, reason="nvjitlink requires lto for ptx linking") +def test_linker_link_ptx_culink(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + def test_linker_link_cubin(compile_ptx_functions): options = LinkerOptions(arch=ARCH) linker = Linker(*compile_ptx_functions, options=options) From 188ae6223fbb2a8dc567551627483b75230149f8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:01:38 -0800 Subject: [PATCH 20/29] update the test --- cuda_core/tests/test_linker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index f5dc33dd..b4008ab6 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,6 +4,8 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper + + device_function_a = """ __device__ int B(); __device__ int C(int a, int b); From 0522d2b71fb682199dfd801f61501aebc030f82e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:10:51 -0800 Subject: [PATCH 21/29] update the documentation to touch on LinkerOptions vs CUDA version --- cuda_core/docs/source/release/0.1.1-notes.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index 6e491a62..29694f4a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -6,9 +6,13 @@ Released on Nov , 2024 - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink API. +- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API + depending on your CUDA version. ## Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! +- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, + the backend is the cuLink api which supports only a subset of the options that nvjitlink does. + Further, some options aren't available on CUDA versions <12.6 From df8cbea9727a7438b9013d2c695a8d6a21560537 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 5 Dec 2024 10:38:26 -0800 Subject: [PATCH 22/29] use rdc for nvrtc compilation and improve exception reporting by dumping the log --- cuda_core/cuda/core/experimental/_linker.py | 45 +++++++++++++++++---- cuda_core/tests/test_linker.py | 12 +++--- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index bf232cad..b6c28ba5 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -64,7 +64,10 @@ def _lazy_init(): @dataclass class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink. + """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available + whenusing the cuda.bindings.nvjitlink backend. Some options are only available when using newer + or older versions of cuda. + Attributes ---------- @@ -350,11 +353,16 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") if _nvjitlink: - handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + handle = self._exception_manager( + lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options) + ) + use_nvjitlink = True else: - handle = handle_return( - _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + handle = self._exception_manager( + lambda: handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) ) use_nvjitlink = False self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) @@ -363,6 +371,27 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): assert isinstance(code, ObjectCode) self._add_code_object(code) + def _exception_manager(self, action): + """ + Helper function to improve the error message of excepotions raised by the linker backend. + + Parameters + ---------- + action : callable + The action to be performed. + + Returns + ------- + The return value of the action. + """ + try: + return action() + except Exception as e: + error = self.get_error_log() + raise RuntimeError( + f"Exception raised by {"nvjitlink" if _nvjitlink else "cuLink"}: {e}.\nLinker error log: {error}" + ) from e + def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -392,7 +421,7 @@ def link(self, target_type) -> ObjectCode: if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") if _nvjitlink: - _nvjitlink.complete(self._mnff.handle) + self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle)) if target_type == "cubin": get_size = _nvjitlink.get_linked_cubin_size get_code = _nvjitlink.get_linked_cubin @@ -400,11 +429,11 @@ def link(self, target_type) -> ObjectCode: get_size = _nvjitlink.get_linked_ptx_size get_code = _nvjitlink.get_linked_ptx - size = get_size(self._mnff.handle) + size = self._exception_manager(lambda: get_size(self._mnff.handle)) code = bytearray(size) - get_code(self._mnff.handle, code) + self._exception_manager(lambda: get_code(self._mnff.handle, code)) else: - addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle))) code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index b4008ab6..6163d9a8 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -9,7 +9,7 @@ device_function_a = """ __device__ int B(); __device__ int C(int a, int b); -__device__ void A() { int result = C(B(), 1);} +__global__ void A() { int result = C(B(), 1);} """ device_function_b = "__device__ int B() { return 0; }" device_function_c = "__device__ int C(int a, int b) { return a + b; }" @@ -29,9 +29,11 @@ @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_b_ptx = Program(device_function_b, "c++").compile("ptx") - object_code_c_ptx = Program(device_function_c, "c++").compile("ptx") - object_code_a_ptx = Program(device_function_a, "c++").compile("ptx") + # Without rdc (relocatable device code) option, the generated ptx will not included any unreferenced + # device functions, causing the link to fail + object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) + object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) + object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",)) return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @@ -46,7 +48,7 @@ def compile_ltoir_functions(init_cuda): culink_options = [ - LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, optimization_level=3), From 761bea0b83252519d986d8111e88cad5edaebfcf Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 5 Dec 2024 10:52:32 -0800 Subject: [PATCH 23/29] add note to link() --- cuda_core/cuda/core/experimental/_linker.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index b6c28ba5..01c4a0e9 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -418,6 +418,24 @@ def _add_code_object(self, object_code: ObjectCode): ) def link(self, target_type) -> ObjectCode: + """ + Links the provided object codes into a single output of the specified target type. + + Parameters + ---------- + target_type : str + The type of the target output. Must be either "cubin" or "ptx". + + Returns + ------- + ObjectCode + The linked object code of the specified target type. + + Note + ------ + See nvrtc compiler options documnetation to ensure the input ObjectCodes are + correctly compiled for linking. + """ if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") if _nvjitlink: From 9fdbc9fe49cd9eb6a84fdf78326f5db7ea584969 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 6 Dec 2024 15:30:04 -0800 Subject: [PATCH 24/29] remove duplicate test --- cuda_core/tests/test_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 6163d9a8..a9b5d1c2 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -50,7 +50,6 @@ def compile_ltoir_functions(init_cuda): culink_options = [ LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), From 677bd6df015834ad5fed6d2cd075623c1935229b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 02:38:20 +0000 Subject: [PATCH 25/29] reuse backend decision logic in tests + some nitpicks --- cuda_core/cuda/core/experimental/_linker.py | 36 ++++++++++++++------- cuda_core/tests/test_linker.py | 27 +++++----------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 01c4a0e9..8cd603d1 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -20,29 +20,43 @@ _nvjitlink_input_types = None # populated if nvJitLink cannot be used -def _lazy_init(): - global _inited - if _inited: +# Note: this function is reused in the tests +def _decide_nvjitlink_or_driver(): + """Returns True if falling back to the cuLink* driver APIs.""" + global _driver_ver, _driver, _nvjitlink + if _driver or _nvjitlink: return - global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: - from cuda.bindings import nvjitlink + from cuda.bindings import nvjitlink as _nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: # binding is not available - nvjitlink = None + _nvjitlink = None else: if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: # binding is available, but nvJitLink is not installed - nvjitlink = None - elif _driver_ver > nvjitlink.version(): + _nvjitlink = None + + if _nvjitlink is None: + _driver = cuda + return True + else: + return False + + +def _lazy_init(): + global _inited, _nvjitlink_input_types, _driver_input_types + if _inited: + return + + _decide_nvjitlink_or_driver() + if _nvjitlink: + if _driver_ver > _nvjitlink.version(): # TODO: nvJitLink is not new enough, warn? pass - if nvjitlink: - _nvjitlink = nvjitlink _nvjitlink_input_types = { "ptx": _nvjitlink.InputType.PTX, "cubin": _nvjitlink.InputType.CUBIN, @@ -51,8 +65,6 @@ def _lazy_init(): "object": _nvjitlink.InputType.OBJECT, } else: - from cuda import cuda as _driver - _driver_input_types = { "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index a9b5d1c2..1af746f8 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,48 +1,37 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental import Linker, LinkerOptions, Program, _linker from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper - -device_function_a = """ -__device__ int B(); -__device__ int C(int a, int b); +kernel_a = """ +extern __device__ int B(); +extern __device__ int C(int a, int b); __global__ void A() { int result = C(B(), 1);} """ device_function_b = "__device__ int B() { return 0; }" device_function_c = "__device__ int C(int a, int b) { return a + b; }" -culink_backend = False -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True +culink_backend = _linker._decide_nvjitlink_or_driver() @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - # Without rdc (relocatable device code) option, the generated ptx will not included any unreferenced + # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail + object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) - object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",)) return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) - object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From 758ae01a7855775b88b91ab7250c34c33225eca8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:24:23 +0000 Subject: [PATCH 26/29] make _exception_manager a ctx mgr --- cuda_core/cuda/core/experimental/_linker.py | 88 ++++++++++----------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 8cd603d1..c27ea94d 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -4,6 +4,7 @@ import ctypes import weakref +from contextlib import contextmanager from dataclasses import dataclass from typing import List, Optional @@ -329,6 +330,26 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) +# This needs to be a free function not a method, as it's disallowed by contextmanager. +@contextmanager +def _exception_manager(self): + """ + A helper function to improve the error message of exceptions raised by the linker backend. + """ + try: + yield + except Exception as e: + error_log = "" + if hasattr(self, "_mnff"): + # our constructor could raise, in which case there's no handle available + error_log = self.get_error_log() + # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but + # unfortunately we are still supporting Python 3.9/3.10... + # Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0]. + e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:]) + raise e + + class Linker: """ Linker class for managing the linking of object codes with specified options. @@ -364,46 +385,21 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): raise ValueError("At least one ObjectCode object must be provided") self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = self._exception_manager( - lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - - use_nvjitlink = True - else: - handle = self._exception_manager( - lambda: handle_return( + with _exception_manager(self): + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) ) - ) - use_nvjitlink = False + use_nvjitlink = False self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) - def _exception_manager(self, action): - """ - Helper function to improve the error message of excepotions raised by the linker backend. - - Parameters - ---------- - action : callable - The action to be performed. - - Returns - ------- - The return value of the action. - """ - try: - return action() - except Exception as e: - error = self.get_error_log() - raise RuntimeError( - f"Exception raised by {"nvjitlink" if _nvjitlink else "cuLink"}: {e}.\nLinker error log: {error}" - ) from e - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -450,21 +446,21 @@ def link(self, target_type) -> ObjectCode: """ if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle)) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = self._exception_manager(lambda: get_size(self._mnff.handle)) - code = bytearray(size) - self._exception_manager(lambda: get_code(self._mnff.handle, code)) - else: - addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle))) - code = (ctypes.c_char * size).from_address(addr) + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) From 06ee1e28e875e4eab869ce530fd00560ec010a8f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:33:48 +0000 Subject: [PATCH 27/29] also guard the add_data calls with _exception_manager + add missing docstrings --- cuda_core/cuda/core/experimental/_linker.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index c27ea94d..a1f93e18 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -403,27 +403,28 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.add_data( self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) ) - ) def link(self, target_type) -> ObjectCode: """ @@ -465,6 +466,12 @@ def link(self, target_type) -> ObjectCode: return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: + """ Get the error log generated by the linker. + + Returns + ------- + The error log. + """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) @@ -474,6 +481,12 @@ def get_error_log(self) -> str: return log.decode() def get_info_log(self) -> str: + """Get the info log generated by the linker. + + Returns + ------- + The info log. + """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) @@ -492,8 +505,10 @@ def _input_type_from_code_type(self, code_type: str): return input_type @property - def handle(self) -> int: + def handle(self): + """Return the linker handle object.""" return self._mnff.handle def close(self): + """Destroy this linker.""" self._mnff.close() From faf4855b46d363715ae75921364464b5117cd9e4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:36:11 +0000 Subject: [PATCH 28/29] add missing license header --- cuda_core/tests/test_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1af746f8..54cd8cf4 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import pytest from cuda.core.experimental import Linker, LinkerOptions, Program, _linker From 1c9dea6bfc3cb3112ab065366d1d7832f99478a4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 04:17:40 +0000 Subject: [PATCH 29/29] improve docs --- cuda_core/cuda/core/experimental/_linker.py | 127 ++++++++----------- cuda_core/docs/source/release/0.1.1-notes.md | 9 +- 2 files changed, 58 insertions(+), 78 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index a1f93e18..09a237a4 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -77,118 +77,92 @@ def _lazy_init(): @dataclass class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available - whenusing the cuda.bindings.nvjitlink backend. Some options are only available when using newer - or older versions of cuda. + """Customizable :obj:`Linker` options. + Since the linker would choose to use nvJitLink or the driver APIs as the linking backed, + not all options are applicable. Attributes ---------- arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. + Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ max_register_count : int, optional Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= + Maps to: ``-maxrregcount=``. time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time + Print timing information to the info log. + Maps to ``-time``. + Default: False. verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose + Print verbose messages to the info log. + Maps to ``-verbose``. + Default: False. link_time_optimization : bool, optional Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto + Maps to: ``-lto``. + Default: False. ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx + Emit PTX after linking instead of CUBIN; only supported with ``-lto``. + Maps to ``-ptx``. + Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O + Maps to ``-O``. debug : bool, optional Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g + Maps to ``-g`` + Default: False. lineinfo : bool, optional Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo + Maps to ``-lineinfo``. + Default: False. ftz : bool, optional Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= + Maps to ``-ftz=``. + Default: False. prec_div : bool, optional Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= + Maps to ``-prec-div=``. + Default: True. prec_sqrt : bool, optional Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= + Maps to ``-prec-sqrt=``. + Default: True. fma : bool, optional Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= + Maps to ``-fma=``. + Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= + Maps to ``-kernels-used=``. variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= + Pass a list of variables that are used; any not in the list can be removed. + Maps to ``-variables-used=`` optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables + Maps to: ``-optimize-unused-variables`` + Default: False. xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= + Pass options to PTXAS. + Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Default: 1 - Acceptable value type: int - Maps to: -split-compile= + Maps to ``-split-compile=``. + Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= + Maps to ``-split-compile-extended=``. + Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache + Maps to ``-no-cache``. + Default: False. """ arch: str @@ -351,8 +325,11 @@ def _exception_manager(self): class Linker: - """ - Linker class for managing the linking of object codes with specified options. + """Represent a linking machinery to link one or multiple object codes into + :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from CUDA driver). Parameters ---------- @@ -442,7 +419,7 @@ def link(self, target_type) -> ObjectCode: Note ------ - See nvrtc compiler options documnetation to ensure the input ObjectCodes are + See nvrtc compiler options documnetation to ensure the input object codes are correctly compiled for linking. """ if target_type not in ("cubin", "ptx"): @@ -470,7 +447,8 @@ def get_error_log(self) -> str: Returns ------- - The error log. + str + The error log. """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) @@ -485,7 +463,8 @@ def get_info_log(self) -> str: Returns ------- - The info log. + str + The info log. """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index cd3530b9..34cad7d1 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -1,13 +1,14 @@ # `cuda.core` Release notes -Released on Nov , 2024 +Released on Dec XX, 2024 ## Hightlights - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API - depending on your CUDA version. +- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`s. Under + the hood, it uses either the nvJitLink or cuLink APIs depending on the CUDA version detected + in the current environment. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools @@ -15,6 +16,6 @@ Released on Nov , 2024 - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, +- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink api which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6