diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index effcd626b3..cc096a6118 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-08-26T07:38:08","documenter_version":"1.4.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.5","generation_timestamp":"2024-09-04T18:14:38","documenter_version":"1.4.0"}} \ No newline at end of file diff --git a/dev/api/array/index.html b/dev/api/array/index.html index 448066458b..bd88a91311 100644 --- a/dev/api/array/index.html +++ b/dev/api/array/index.html @@ -3,4 +3,4 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Array programming

The CUDA array type, CuArray, generally implements the Base array interface and all of its expected methods.

+

Array programming

The CUDA array type, CuArray, generally implements the Base array interface and all of its expected methods.

diff --git a/dev/api/compiler/index.html b/dev/api/compiler/index.html index abca5edcb0..6dd825b5c4 100644 --- a/dev/api/compiler/index.html +++ b/dev/api/compiler/index.html @@ -3,8 +3,8 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Compiler

Execution

The main entry-point to the compiler is the @cuda macro:

CUDA.@cudaMacro
@cuda [kwargs...] func(args...)

High-level interface for executing code on a GPU. The @cuda macro should prefix a call, with func a callable function or object that should return nothing. It will be compiled to a CUDA function upon first use, and to a certain extent arguments will be converted and managed automatically using cudaconvert. Finally, a call to cudacall is performed, scheduling a kernel launch on the current CUDA context.

Several keyword arguments are supported that influence the behavior of @cuda.

  • launch: whether to launch this kernel, defaults to true. If false the returned kernel object should be launched by calling it and passing arguments again.
  • dynamic: use dynamic parallelism to launch device-side kernels, defaults to false.
  • arguments that influence kernel compilation: see cufunction and dynamic_cufunction
  • arguments that influence kernel launch: see CUDA.HostKernel and CUDA.DeviceKernel
source

If needed, you can use a lower-level API that lets you inspect the compiled kernel:

CUDA.cudaconvertFunction
cudaconvert(x)

This function is called for every argument to be passed to a kernel, allowing it to be converted to a GPU-friendly format. By default, the function does nothing and returns the input object x as-is.

Do not add methods to this function, but instead extend the underlying Adapt.jl package and register methods for the CUDA.KernelAdaptor type.

source
CUDA.cufunctionFunction
cufunction(f, tt=Tuple{}; kwargs...)

Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. For a higher-level interface, use @cuda.

The following keyword arguments are supported:

  • minthreads: the required number of threads in a thread block
  • maxthreads: the maximum number of threads in a thread block
  • blocks_per_sm: a minimum number of thread blocks to be scheduled on a single multiprocessor
  • maxregs: the maximum number of registers to be allocated to a single thread (only supported on LLVM 4.0+)
  • name: override the name that the kernel will have in the generated code
  • always_inline: inline all function calls in the kernel
  • fastmath: use less precise square roots and flush denormals
  • cap and ptx: to override the compute capability and PTX version to compile for

The output of this function is automatically cached, i.e. you can simply call cufunction in a hot path without degrading performance. New code will be generated automatically when the function changes, or when different types or keyword arguments are provided.

source
CUDA.HostKernelType
(::HostKernel)(args...; kwargs...)
-(::DeviceKernel)(args...; kwargs...)

Low-level interface to call a compiled kernel, passing GPU-compatible arguments in args. For a higher-level interface, use @cuda.

A HostKernel is callable on the host, and a DeviceKernel is callable on the device (created by @cuda with dynamic=true).

The following keyword arguments are supported:

  • threads (default: 1): Number of threads per block, or a 1-, 2- or 3-tuple of dimensions (e.g. threads=(32, 32) for a 2D block of 32×32 threads). Use threadIdx() and blockDim() to query from within the kernel.
  • blocks (default: 1): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of dimensions (e.g. blocks=(2, 4, 2) for a 3D grid of blocks). Use blockIdx() and gridDim() to query from within the kernel.
  • shmem (default: 0): Amount of dynamic shared memory in bytes to allocate per thread block; used by CuDynamicSharedArray.
  • stream (default: stream()): CuStream to launch the kernel on.
  • cooperative (default: false): whether to launch a cooperative kernel that supports grid synchronization (see CG.this_grid and CG.sync). Note that this requires care wrt. the number of blocks launched.
source
CUDA.versionFunction
version(k::HostKernel)

Queries the PTX and SM versions a kernel was compiled for. Returns a named tuple.

source
CUDA.maxthreadsFunction
maxthreads(k::HostKernel)

Queries the maximum number of threads a kernel can use in a single block.

source
CUDA.memoryFunction
memory(k::HostKernel)

Queries the local, shared and constant memory usage of a compiled kernel in bytes. Returns a named tuple.

source

Reflection

If you want to inspect generated code, you can use macros that resemble functionality from the InteractiveUtils standard library:

@device_code_lowered
+

Compiler

Execution

The main entry-point to the compiler is the @cuda macro:

CUDA.@cudaMacro
@cuda [kwargs...] func(args...)

High-level interface for executing code on a GPU. The @cuda macro should prefix a call, with func a callable function or object that should return nothing. It will be compiled to a CUDA function upon first use, and to a certain extent arguments will be converted and managed automatically using cudaconvert. Finally, a call to cudacall is performed, scheduling a kernel launch on the current CUDA context.

Several keyword arguments are supported that influence the behavior of @cuda.

  • launch: whether to launch this kernel, defaults to true. If false the returned kernel object should be launched by calling it and passing arguments again.
  • dynamic: use dynamic parallelism to launch device-side kernels, defaults to false.
  • arguments that influence kernel compilation: see cufunction and dynamic_cufunction
  • arguments that influence kernel launch: see CUDA.HostKernel and CUDA.DeviceKernel
source
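
For example, a trivial kernel that fills an array with each thread's global index could be launched as follows. This is a minimal sketch: my_kernel and the launch configuration are illustrative, not part of the API above.

    # illustrative kernel: every thread writes its global index
    function my_kernel(a)
        i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
        if i <= length(a)
            @inbounds a[i] = i
        end
        return nothing
    end

    a = CUDA.zeros(Int32, 1024)
    @cuda threads=256 blocks=4 my_kernel(a)   # 4 blocks of 256 threads cover all 1024 elements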

If needed, you can use a lower-level API that lets you inspect the compiled kernel:

CUDA.cudaconvertFunction
cudaconvert(x)

This function is called for every argument to be passed to a kernel, allowing it to be converted to a GPU-friendly format. By default, the function does nothing and returns the input object x as-is.

Do not add methods to this function, but instead extend the underlying Adapt.jl package and register methods for the CUDA.KernelAdaptor type.

source
CUDA.cufunctionFunction
cufunction(f, tt=Tuple{}; kwargs...)

Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. For a higher-level interface, use @cuda.

The following keyword arguments are supported:

  • minthreads: the required number of threads in a thread block
  • maxthreads: the maximum number of threads in a thread block
  • blocks_per_sm: a minimum number of thread blocks to be scheduled on a single multiprocessor
  • maxregs: the maximum number of registers to be allocated to a single thread (only supported on LLVM 4.0+)
  • name: override the name that the kernel will have in the generated code
  • always_inline: inline all function calls in the kernel
  • fastmath: use less precise square roots and flush denormals
  • cap and ptx: to override the compute capability and PTX version to compile for

The output of this function is automatically cached, i.e. you can simply call cufunction in a hot path without degrading performance. New code will be generated automatically when the function changes, or when different types or keyword arguments are provided.

source
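
As a sketch of this lower-level path (reusing the hypothetical my_kernel and a from the example above), the argument types are taken from the converted, device-side arguments, and the returned kernel object is launched by calling it:

    args = (a,)
    tt = Tuple{map(Core.Typeof, map(cudaconvert, args))...}
    kernel = cufunction(my_kernel, tt)
    kernel(args...; threads=256, blocks=4)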
CUDA.HostKernelType
(::HostKernel)(args...; kwargs...)
+(::DeviceKernel)(args...; kwargs...)

Low-level interface to call a compiled kernel, passing GPU-compatible arguments in args. For a higher-level interface, use @cuda.

A HostKernel is callable on the host, and a DeviceKernel is callable on the device (created by @cuda with dynamic=true).

The following keyword arguments are supported:

  • threads (default: 1): Number of threads per block, or a 1-, 2- or 3-tuple of dimensions (e.g. threads=(32, 32) for a 2D block of 32×32 threads). Use threadIdx() and blockDim() to query from within the kernel.
  • blocks (default: 1): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of dimensions (e.g. blocks=(2, 4, 2) for a 3D grid of blocks). Use blockIdx() and gridDim() to query from within the kernel.
  • shmem (default: 0): Amount of dynamic shared memory in bytes to allocate per thread block; used by CuDynamicSharedArray.
  • stream (default: stream()): CuStream to launch the kernel on.
  • cooperative (default: false): whether to launch a cooperative kernel that supports grid synchronization (see CG.this_grid and CG.sync). Note that this requires care wrt. the number of blocks launched.
source
CUDA.versionFunction
version(k::HostKernel)

Queries the PTX and SM versions a kernel was compiled for. Returns a named tuple.

source
CUDA.maxthreadsFunction
maxthreads(k::HostKernel)

Queries the maximum number of threads a kernel can use in a single block.

source
CUDA.memoryFunction
memory(k::HostKernel)

Queries the local, shared and constant memory usage of a compiled kernel in bytes. Returns a named tuple.

source
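
These introspection helpers combine naturally with @cuda launch=false to inspect a kernel before running it; a sketch, again using the hypothetical my_kernel and a from above:

    kernel = @cuda launch=false my_kernel(a)
    CUDA.maxthreads(kernel)   # upper bound on threads per block for this kernel
    CUDA.memory(kernel)       # local, shared and constant memory usage in bytes
    CUDA.version(kernel)      # PTX and SM versions the kernel was compiled for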

Reflection

If you want to inspect generated code, you can use macros that resemble functionality from the InteractiveUtils standard library:

@device_code_lowered
 @device_code_typed
 @device_code_warntype
 @device_code_llvm
@@ -14,5 +14,5 @@
 CUDA.code_warntype
 CUDA.code_llvm
 CUDA.code_ptx
-CUDA.code_sass

For more information, please consult the GPUCompiler.jl documentation. Only the code_sass functionality is actually defined in CUDA.jl:

CUDA.code_sassFunction
code_sass([io], f, types; raw=false)
-code_sass(f, [io]; raw=false)

Prints the SASS code corresponding to one or more CUDA modules to io, which defaults to stdout.

If providing both f and types, it is assumed that this uniquely identifies a kernel function, for which SASS code will be generated, and printed to io.

If only providing a callable function f, typically specified using the do syntax, the SASS code for all modules executed during evaluation of f will be printed. This can be convenient to display the SASS code for functions whose source code is not available.

  • raw: dump the assembly like nvdisasm reports it, without post-processing;
  • in the case of specifying f and types: all keyword arguments from cufunction

See also: @device_code_sass

source
+CUDA.code_sass

For more information, please consult the GPUCompiler.jl documentation. Only the code_sass functionality is actually defined in CUDA.jl:

CUDA.code_sassFunction
code_sass([io], f, types; raw=false)
+code_sass(f, [io]; raw=false)

Prints the SASS code corresponding to one or more CUDA modules to io, which defaults to stdout.

If providing both f and types, it is assumed that this uniquely identifies a kernel function, for which SASS code will be generated, and printed to io.

If only providing a callable function f, typically specified using the do syntax, the SASS code for all modules executed during evaluation of f will be printed. This can be convenient to display the SASS code for functions whose source code is not available.

  • raw: dump the assembly like nvdisasm reports it, without post-processing;
  • in the case of specifying f and types: all keyword arguments from cufunction

See also: @device_code_sass

source
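
For instance, the do-block form can dump the SASS code of a broadcast operation; a sketch that assumes a functional GPU:

    CUDA.code_sass() do
        a = CUDA.ones(Float32, 32)
        a .+= 1f0
    end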
diff --git a/dev/api/essentials/index.html b/dev/api/essentials/index.html index 425a0c1452..1ce41181d8 100644 --- a/dev/api/essentials/index.html +++ b/dev/api/essentials/index.html @@ -3,8 +3,8 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Essentials

Initialization

CUDA.functionalMethod
functional(show_reason=false)

Check if the package has been configured successfully and is ready to use.

This call is intended for packages that support conditionally using an available GPU. If you fail to check whether CUDA is functional, actual use of functionality might warn and error.

source
CUDA.has_cudaFunction
has_cuda()::Bool

Check whether the local system provides an installation of the CUDA driver and runtime. Use this function if your code loads packages that require CUDA.jl.

source
CUDA.has_cuda_gpuFunction
has_cuda_gpu()::Bool

Check whether the local system provides an installation of the CUDA driver and runtime, and if it contains a CUDA-capable GPU. See has_cuda for more details.

Note that this function initializes the CUDA API in order to check for the number of GPUs.

source

Global state

CUDA.contextFunction
context(ptr)

Identify the context memory was allocated in.

source
context()::CuContext

Get or create a CUDA context for the current thread (as opposed to current_context which may return nothing if there is no context bound to the current thread).

source
CUDA.context!Function
context!(ctx::CuContext)
-context!(ctx::CuContext) do ... end

Bind the current host thread to the context ctx. Returns the previously-bound context. If used with do-block syntax, the change is only temporary.

Note that the contexts used with this call should be previously acquired by calling context, and not arbitrary contexts created by calling the CuContext constructor.

source
CUDA.device!Function
device!(dev::Integer)
+

Essentials

Initialization

CUDA.functionalMethod
functional(show_reason=false)

Check if the package has been configured successfully and is ready to use.

This call is intended for packages that support conditionally using an available GPU. If you fail to check whether CUDA is functional, actual use of functionality might warn and error.

source
CUDA.has_cudaFunction
has_cuda()::Bool

Check whether the local system provides an installation of the CUDA driver and runtime. Use this function if your code loads packages that require CUDA.jl.

source
CUDA.has_cuda_gpuFunction
has_cuda_gpu()::Bool

Check whether the local system provides an installation of the CUDA driver and runtime, and if it contains a CUDA-capable GPU. See has_cuda for more details.

Note that this function initializes the CUDA API in order to check for the number of GPUs.

source
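
A typical usage pattern in downstream packages looks like the following sketch, falling back to the CPU when no GPU is usable:

    if CUDA.functional()
        a = CUDA.rand(1024)
        println(sum(a))
    else
        @warn "CUDA is not functional, falling back to the CPU"
    end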

Global state

CUDA.contextFunction
context(ptr)

Identify the context memory was allocated in.

source
context()::CuContext

Get or create a CUDA context for the current thread (as opposed to current_context which may return nothing if there is no context bound to the current thread).

source
CUDA.context!Function
context!(ctx::CuContext)
+context!(ctx::CuContext) do ... end

Bind the current host thread to the context ctx. Returns the previously-bound context. If used with do-block syntax, the change is only temporary.

Note that the contexts used with this call should be previously acquired by calling context, and not arbitrary contexts created by calling the CuContext constructor.

source
CUDA.device!Function
device!(dev::Integer)
 device!(dev::CuDevice)
-device!(dev) do ... end

Sets dev as the current active device for the calling host thread. Devices can be specified by integer id, or as a CuDevice (slightly faster). Both functions can be used with do-block syntax, in which case the device is only changed temporarily, without changing the default device used to initialize new threads or tasks.

Calling this function at the start of a session will make sure CUDA is initialized (i.e., a primary context will be created and activated).

source
CUDA.device_reset!Function
device_reset!(dev::CuDevice=device())

Reset the CUDA state associated with a device. This call will release the underlying context, at which point any objects allocated in that context will be invalidated.

Note that this does not guarantee to free up all memory allocations, as many are not bound to a context, so it is generally not useful to call this function to free up memory.

Warning

This function is only reliable on CUDA driver >= v12.0, and may lead to crashes if used on older drivers.

source
CUDA.streamFunction
stream()

Get the CUDA stream that should be used as the default one for the currently executing task.

source
CUDA.stream!Function
stream!(::CuStream)
-stream!(::CuStream) do ... end

Change the default CUDA stream for the currently executing task, temporarily if using the do-block version of this function.

source
+device!(dev) do ... end

Sets dev as the current active device for the calling host thread. Devices can be specified by integer id, or as a CuDevice (slightly faster). Both functions can be used with do-block syntax, in which case the device is only changed temporarily, without changing the default device used to initialize new threads or tasks.

Calling this function at the start of a session will make sure CUDA is initialized (i.e., a primary context will be created and activated).

source
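
For example, a sketch that assumes a system with at least two GPUs:

    device!(0)        # make device 0 the default for this thread
    device!(1) do     # temporarily compute on device 1
        a = CUDA.rand(1024)
        sum(a)
    end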
CUDA.device_reset!Function
device_reset!(dev::CuDevice=device())

Reset the CUDA state associated with a device. This call will release the underlying context, at which point any objects allocated in that context will be invalidated.

Note that this does not guarantee to free up all memory allocations, as many are not bound to a context, so it is generally not useful to call this function to free up memory.

Warning

This function is only reliable on CUDA driver >= v12.0, and may lead to crashes if used on older drivers.

source
CUDA.streamFunction
stream()

Get the CUDA stream that should be used as the default one for the currently executing task.

source
CUDA.stream!Function
stream!(::CuStream)
+stream!(::CuStream) do ... end

Change the default CUDA stream for the currently executing task, temporarily if using the do-block version of this function.

source
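
A minimal sketch of using a dedicated stream for a set of operations:

    s = CuStream()
    stream!(s) do
        a = CUDA.rand(1024)
        a .*= 2f0         # broadcasts issued here are queued on stream s
    end
    synchronize(s)        # wait for the work on s to finish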
diff --git a/dev/api/kernel/index.html b/dev/api/kernel/index.html index 472e3b8558..9ab2d9dee5 100644 --- a/dev/api/kernel/index.html +++ b/dev/api/kernel/index.html @@ -3,14 +3,14 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Kernel programming

This section lists the package's public functionality that corresponds to special CUDA functions for use in device code. It is loosely organized according to the C language extensions appendix from the CUDA C programming guide. For more information about certain intrinsics, refer to the aforementioned NVIDIA documentation.

Indexing and dimensions

CUDA.warpsizeFunction
warpsize(dev::CuDevice)

Returns the warp size (in threads) of the device.

source
warpsize()::Int32

Returns the warp size (in threads).

source
CUDA.laneidFunction
laneid()::Int32

Returns the thread's lane within the warp.

source
CUDA.active_maskFunction
active_mask()

Returns a 32-bit mask indicating which threads in a warp are active with the currently executing thread.

source

Device arrays

CUDA.jl provides a primitive, lightweight array type to manage GPU data organized in a plain, dense fashion. This is the device counterpart to the CuArray, and implements (part of) the array interface as well as other functionality for use on the GPU:

CUDA.CuDeviceArrayType
CuDeviceArray{T,N,A}(ptr, dims, [maxsize])

Construct an N-dimensional dense CUDA device array with element type T wrapping a pointer, where N is determined from the length of dims and T is determined from the type of ptr. dims may be a single scalar, or a tuple of integers corresponding to the lengths in each dimension. If the rank N is supplied explicitly as in Array{T,N}(dims), then it must match the length of dims. The same applies to the element type T, which should match the type of the pointer ptr.

source
CUDA.ConstType
Const(A::CuDeviceArray)

Mark a CuDeviceArray as constant/read-only. The invariant guaranteed is that you will not modify a CuDeviceArray for the duration of the current kernel.

This API can only be used on devices with compute capability 3.5 or higher.

Warning

Experimental API. Subject to change without deprecation.

source

Memory types

Shared memory

CUDA.CuStaticSharedArrayFunction
CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,N,AS.Shared}

Get an array of type T and dimensions dims (either an integer length or tuple shape) pointing to a statically-allocated piece of shared memory. The type should be statically inferable and the dimensions should be constant, or an error will be thrown and the generator function will be called dynamically.

source
CUDA.CuDynamicSharedArrayFunction
CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,N,AS.Shared}

Get an array of type T and dimensions dims (either an integer length or tuple shape) pointing to a dynamically-allocated piece of shared memory. The type should be statically inferable or an error will be thrown and the generator function will be called dynamically.

Note that the amount of dynamic shared memory needs to be specified when launching the kernel.

Optionally, an offset parameter indicating how many bytes to add to the base shared memory pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic shared memory; in the case of a homogeneous multi-part buffer it is preferred to use view.

source

Texture memory

CUDA.CuDeviceTextureType
CuDeviceTexture{T,N,M,NC,I}

N-dimensional device texture with elements of type T. This type is the device-side counterpart of CuTexture{T,N,P}, and can be used to access textures using regular indexing notation. If NC is true, indices used by these accesses should be normalized, i.e., fall into the [0,1) domain. The I type parameter indicates the kind of interpolation that happens when indexing into this texture. The source memory of the texture is specified by the M parameter, either linear memory or a texture array.

Device-side texture objects cannot be created directly, but should be created host-side using CuTexture{T,N,P} and passed to the kernel as an argument.

Warning

Experimental API. Subject to change without deprecation.

source

Synchronization

CUDA.sync_threadsFunction
sync_threads()

Waits until all threads in the thread block have reached this point and all global and shared memory accesses made by these threads prior to sync_threads() are visible to all threads in the block.

source
CUDA.sync_threads_countFunction
sync_threads_count(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns the number of threads for which predicate evaluates to true.

source
CUDA.sync_threads_andFunction
sync_threads_and(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns true if and only if predicate evaluates to true for all of them.

source
CUDA.sync_threads_orFunction
sync_threads_or(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns true if and only if predicate evaluates to true for any of them.

source
CUDA.sync_warpFunction
sync_warp(mask::Integer=FULL_MASK)

Waits until the threads in the warp, selected by means of the bitmask mask, have reached this point, and ensures that all global and shared memory accesses made by these threads prior to sync_warp() are visible to those threads in the warp. The default value for mask selects all threads in the warp.

Note

Requires CUDA >= 9.0 and sm_6.2

source
CUDA.threadfence_blockFunction
threadfence_block()

A memory fence that ensures that:

  • All writes to all memory made by the calling thread before the call to threadfence_block() are observed by all threads in the block of the calling thread as occurring before all writes to all memory made by the calling thread after the call to threadfence_block()
  • All reads from all memory made by the calling thread before the call to threadfence_block() are ordered before all reads from all memory made by the calling thread after the call to threadfence_block().
source
CUDA.threadfenceFunction
threadfence()

A memory fence that acts as threadfence_block for all threads in the block of the calling thread and also ensures that no writes to all memory made by the calling thread after the call to threadfence() are observed by any thread in the device as occurring before any write to all memory made by the calling thread before the call to threadfence().

Note that for this ordering guarantee to be true, the observing threads must truly observe the memory and not cached versions of it; this requires the use of volatile loads and stores, which are not available from Julia right now.

source
CUDA.threadfence_systemFunction
threadfence_system()

A memory fence that acts as threadfence_block for all threads in the block of the calling thread and also ensures that all writes to all memory made by the calling thread before the call to threadfence_system() are observed by all threads in the device, host threads, and all threads in peer devices as occurring before all writes to all memory made by the calling thread after the call to threadfence_system().

source

Time functions

CUDA.clockFunction
clock(UInt32)

Returns the value of a per-multiprocessor counter that is incremented every clock cycle.

source
clock(UInt64)

Returns the value of a per-multiprocessor counter that is incremented every clock cycle.

source
CUDA.nanosleepFunction
nanosleep(t)

Puts a thread to sleep for a given amount of time t (in nanoseconds).

Note

Requires CUDA >= 10.0 and sm_6.2

source

Warp-level functions

Voting

The warp vote functions allow the threads of a given warp to perform a reduction-and-broadcast operation. These functions take as input a boolean predicate from each thread in the warp and evaluate it. The results of that evaluation are combined (reduced) across the active threads of the warp in one of several ways, broadcasting a single return value to each participating thread.

CUDA.vote_all_syncFunction
vote_all_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is true for all of them.

source
CUDA.vote_any_syncFunction
vote_any_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is true for any of them.

source
CUDA.vote_uni_syncFunction
vote_uni_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is the same for all of them.

source
CUDA.vote_ballot_syncFunction
vote_ballot_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the warp and the Nth thread is active.

source

Shuffle

CUDA.shfl_syncFunction
shfl_sync(threadmask::UInt32, val, lane::Integer, width::Integer=32)

Shuffle a value from a directly indexed lane lane, and synchronize threads according to threadmask.

source
CUDA.shfl_up_syncFunction
shfl_up_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)

Shuffle a value from a lane with lower ID relative to caller, and synchronize threads according to threadmask.

source
CUDA.shfl_down_syncFunction
shfl_down_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)

Shuffle a value from a lane with higher ID relative to caller, and synchronize threads according to threadmask.

source
CUDA.shfl_xor_syncFunction
shfl_xor_sync(threadmask::UInt32, val, mask::Integer, width::Integer=32)

Shuffle a value from a lane based on bitwise XOR of own lane ID with mask, and synchronize threads according to threadmask.

source

Formatted Output

CUDA.@cushowMacro
@cushow(ex)

GPU analog of Base.@show. It comes with the same type restrictions as @cuprintf.

@cushow threadIdx().x
source
CUDA.@cuprintMacro
@cuprint(xs...)
+

Kernel programming

This section lists the package's public functionality that corresponds to special CUDA functions for use in device code. It is loosely organized according to the C language extensions appendix from the CUDA C programming guide. For more information about certain intrinsics, refer to the aforementioned NVIDIA documentation.

Indexing and dimensions

CUDA.warpsizeFunction
warpsize(dev::CuDevice)

Returns the warp size (in threads) of the device.

source
warpsize()::Int32

Returns the warp size (in threads).

source
CUDA.laneidFunction
laneid()::Int32

Returns the thread's lane within the warp.

source
CUDA.active_maskFunction
active_mask()

Returns a 32-bit mask indicating which threads in a warp are active with the currently executing thread.

source

Device arrays

CUDA.jl provides a primitive, lightweight array type to manage GPU data organized in a plain, dense fashion. This is the device counterpart to the CuArray, and implements (part of) the array interface as well as other functionality for use on the GPU:

CUDA.CuDeviceArrayType
CuDeviceArray{T,N,A}(ptr, dims, [maxsize])

Construct an N-dimensional dense CUDA device array with element type T wrapping a pointer, where N is determined from the length of dims and T is determined from the type of ptr. dims may be a single scalar, or a tuple of integers corresponding to the lengths in each dimension. If the rank N is supplied explicitly as in Array{T,N}(dims), then it must match the length of dims. The same applies to the element type T, which should match the type of the pointer ptr.

source
CUDA.ConstType
Const(A::CuDeviceArray)

Mark a CuDeviceArray as constant/read-only. The invariant guaranteed is that you will not modify a CuDeviceArray for the duration of the current kernel.

This API can only be used on devices with compute capability 3.5 or higher.

Warning

Experimental API. Subject to change without deprecation.

source

Memory types

Shared memory

CUDA.CuStaticSharedArrayFunction
CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,N,AS.Shared}

Get an array of type T and dimensions dims (either an integer length or tuple shape) pointing to a statically-allocated piece of shared memory. The type should be statically inferable and the dimensions should be constant, or an error will be thrown and the generator function will be called dynamically.

source
CUDA.CuDynamicSharedArrayFunction
CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,N,AS.Shared}

Get an array of type T and dimensions dims (either an integer length or tuple shape) pointing to a dynamically-allocated piece of shared memory. The type should be statically inferable or an error will be thrown and the generator function will be called dynamically.

Note that the amount of dynamic shared memory needs to be specified when launching the kernel.

Optionally, an offset parameter indicating how many bytes to add to the base shared memory pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic shared memory; in the case of a homogeneous multi-part buffer it is preferred to use view.

source
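
The following sketch reverses a small array within a single block using dynamically-allocated shared memory; note how the byte size is passed via the shmem keyword at launch time (reverse_kernel is illustrative):

    function reverse_kernel(a)
        i = threadIdx().x
        n = blockDim().x
        shared = CuDynamicSharedArray(eltype(a), n)
        @inbounds shared[i] = a[i]
        sync_threads()
        @inbounds a[i] = shared[n - i + 1]
        return nothing
    end

    a = CUDA.rand(Float32, 64)
    @cuda threads=length(a) shmem=length(a)*sizeof(Float32) reverse_kernel(a)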

Texture memory

CUDA.CuDeviceTextureType
CuDeviceTexture{T,N,M,NC,I}

N-dimensional device texture with elements of type T. This type is the device-side counterpart of CuTexture{T,N,P}, and can be used to access textures using regular indexing notation. If NC is true, indices used by these accesses should be normalized, i.e., fall into the [0,1) domain. The I type parameter indicates the kind of interpolation that happens when indexing into this texture. The source memory of the texture is specified by the M parameter, either linear memory or a texture array.

Device-side texture objects cannot be created directly, but should be created host-side using CuTexture{T,N,P} and passed to the kernel as an argument.

Warning

Experimental API. Subject to change without deprecation.

source

Synchronization

CUDA.sync_threadsFunction
sync_threads()

Waits until all threads in the thread block have reached this point and all global and shared memory accesses made by these threads prior to sync_threads() are visible to all threads in the block.

source
CUDA.sync_threads_countFunction
sync_threads_count(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns the number of threads for which predicate evaluates to true.

source
CUDA.sync_threads_andFunction
sync_threads_and(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns true if and only if predicate evaluates to true for all of them.

source
CUDA.sync_threads_orFunction
sync_threads_or(predicate)

Identical to sync_threads() with the additional feature that it evaluates predicate for all threads of the block and returns true if and only if predicate evaluates to true for any of them.

source
CUDA.sync_warpFunction
sync_warp(mask::Integer=FULL_MASK)

Waits until the threads in the warp, selected by means of the bitmask mask, have reached this point, and ensures that all global and shared memory accesses made by these threads prior to sync_warp() are visible to those threads in the warp. The default value for mask selects all threads in the warp.

Note

Requires CUDA >= 9.0 and sm_6.2

source
CUDA.threadfence_blockFunction
threadfence_block()

A memory fence that ensures that:

  • All writes to all memory made by the calling thread before the call to threadfence_block() are observed by all threads in the block of the calling thread as occurring before all writes to all memory made by the calling thread after the call to threadfence_block()
  • All reads from all memory made by the calling thread before the call to threadfence_block() are ordered before all reads from all memory made by the calling thread after the call to threadfence_block().
source
CUDA.threadfenceFunction
threadfence()

A memory fence that acts as threadfence_block for all threads in the block of the calling thread and also ensures that no writes to all memory made by the calling thread after the call to threadfence() are observed by any thread in the device as occurring before any write to all memory made by the calling thread before the call to threadfence().

Note that for this ordering guarantee to be true, the observing threads must truly observe the memory and not cached versions of it; this requires the use of volatile loads and stores, which are not available from Julia right now.

source
CUDA.threadfence_systemFunction
threadfence_system()

A memory fence that acts as threadfence_block for all threads in the block of the calling thread and also ensures that all writes to all memory made by the calling thread before the call to threadfence_system() are observed by all threads in the device, host threads, and all threads in peer devices as occurring before all writes to all memory made by the calling thread after the call to threadfence_system().

source

Time functions

CUDA.clockFunction
clock(UInt32)

Returns the value of a per-multiprocessor counter that is incremented every clock cycle.

source
clock(UInt64)

Returns the value of a per-multiprocessor counter that is incremented every clock cycle.

source
CUDA.nanosleepFunction
nanosleep(t)

Puts a thread to sleep for a given amount of time t (in nanoseconds).

Note

Requires CUDA >= 10.0 and sm_6.2

source

Warp-level functions

Voting

The warp vote functions allow the threads of a given warp to perform a reduction-and-broadcast operation. These functions take as input a boolean predicate from each thread in the warp and evaluate it. The results of that evaluation are combined (reduced) across the active threads of the warp in one of several ways, broadcasting a single return value to each participating thread.

CUDA.vote_all_syncFunction
vote_all_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is true for all of them.

source
CUDA.vote_any_syncFunction
vote_any_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is true for any of them.

source
CUDA.vote_uni_syncFunction
vote_uni_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return whether predicate is the same for all of them.

source
CUDA.vote_ballot_syncFunction
vote_ballot_sync(mask::UInt32, predicate::Bool)

Evaluate predicate for all active threads of the warp and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the warp and the Nth thread is active.

source

Shuffle

CUDA.shfl_syncFunction
shfl_sync(threadmask::UInt32, val, lane::Integer, width::Integer=32)

Shuffle a value from a directly indexed lane lane, and synchronize threads according to threadmask.

source
CUDA.shfl_up_syncFunction
shfl_up_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)

Shuffle a value from a lane with lower ID relative to caller, and synchronize threads according to threadmask.

source
CUDA.shfl_down_syncFunction
shfl_down_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)

Shuffle a value from a lane with higher ID relative to caller, and synchronize threads according to threadmask.

source
CUDA.shfl_xor_syncFunction
shfl_xor_sync(threadmask::UInt32, val, mask::Integer, width::Integer=32)

Shuffle a value from a lane based on bitwise XOR of own lane ID with mask, and synchronize threads according to threadmask.

source
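
As an illustration, a warp-wide sum can be built from shfl_down_sync; this is a device-side helper sketch that assumes a full warp of 32 active threads:

    function warp_sum(val)
        offset = 16
        while offset > 0
            val += shfl_down_sync(0xffffffff, val, offset)
            offset >>= 1
        end
        return val        # the lowest lane ends up holding the sum over the warp
    end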

Formatted Output

CUDA.@cushowMacro
@cushow(ex)

GPU analog of Base.@show. It comes with the same type restrictions as @cuprintf.

@cushow threadIdx().x
source
CUDA.@cuprintMacro
@cuprint(xs...)
 @cuprintln(xs...)

Print a textual representation of values xs to standard output from the GPU. The functionality builds on @cuprintf, and is intended as a more user-friendly alternative to that API. However, that also means there's only limited support for argument types, handling 16/32/64-bit signed and unsigned integers, 32 and 64-bit floating point numbers, Cchars and pointers. For more complex output, use @cuprintf directly.

Limited string interpolation is also possible:

    @cuprint("Hello, World ", 42, "\n")
-    @cuprint "Hello, World $(42)\n"
source
CUDA.@cuprintlnMacro
@cuprint(xs...)
 @cuprintln(xs...)

Print a textual representation of values xs to standard output from the GPU. The functionality builds on @cuprintf, and is intended as a more user-friendly alternative to that API. However, that also means there's only limited support for argument types, handling 16/32/64-bit signed and unsigned integers, 32 and 64-bit floating point numbers, Cchars and pointers. For more complex output, use @cuprintf directly.

Limited string interpolation is also possible:

    @cuprint("Hello, World ", 42, "\n")
-    @cuprint "Hello, World $(42)\n"
source
CUDA.@cuprintfMacro
@cuprintf("%Fmt", args...)

Print a formatted string in device context on the host standard output.

Note that this is not a fully C-compliant printf implementation; see the CUDA documentation for supported options and inputs.

Also beware that it is an untyped and unforgiving printf implementation. Type widths need to match, e.g. printing a 64-bit Julia integer requires the %ld formatting string.

source

Assertions

CUDA.@cuassertMacro
@assert cond [text]

Signal assertion failure to the CUDA driver if cond is false. Preferred syntax for writing assertions, mimicking Base.@assert. Message text is optionally displayed upon assertion failure.

Warning

A failed assertion will crash the GPU, so use sparingly as a debugging tool. Furthermore, the assertion might be disabled at various optimization levels, and thus should not cause any side-effects.

source

Atomics

A high-level macro is available to annotate expressions with:

CUDA.@atomicMacro
@atomic a[I] = op(a[I], val)
-@atomic a[I] ...= val

Atomically perform a sequence of operations that loads an array element a[I], performs the operation op on that value and a second value val, and writes the result back to the array. This sequence can be written out as a regular assignment, in which case the same array element should be used in the left and right hand side of the assignment, or as an in-place application of a known operator. In both cases, the array reference should be pure and not induce any side-effects.

Warn

This interface is experimental, and might change without warning. Use the lower-level atomic_...! functions for a stable API, albeit one limited to natively-supported ops.

source

If your expression is not recognized, or you need more control, use the underlying functions:

CUDA.atomic_cas!Function
atomic_cas!(ptr::LLVMPtr{T}, cmp::T, val::T)

Reads the value old located at address ptr and compares it with cmp. If old equals cmp, stores val at the same address. Otherwise, the value old is not changed. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64. Additionally, on GPU hardware with compute capability 7.0+, values of type UInt16 are supported.

source
CUDA.atomic_xchg!Function
atomic_xchg!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr and stores val at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_add!Function
atomic_add!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old + val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32, UInt64, and Float32. Additionally, on GPU hardware with compute capability 6.0+, values of type Float64 are supported.

source
CUDA.atomic_sub!Function
atomic_sub!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old - val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_and!Function
atomic_and!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old & val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_or!Function
atomic_or!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old | val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_xor!Function
atomic_xor!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old ⊻ val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_min!Function
atomic_min!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes min(old, val), and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_max!Function
atomic_max!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes max(old, val), and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_inc!Function
atomic_inc!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes ((old >= val) ? 0 : (old+1)), and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The function returns old.

This operation is only supported for values of type Int32.

source
CUDA.atomic_dec!Function
atomic_dec!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes (((old == 0) | (old > val)) ? val : (old-1) ), and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The function returns old.

This operation is only supported for values of type Int32.

source
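
For comparison with the @atomic macro above, here is a sketch that uses the low-level interface directly through a raw device pointer (add_one_kernel is illustrative):

    function add_one_kernel(a)
        CUDA.atomic_add!(pointer(a, 1), 1f0)
        return nothing
    end

    a = CUDA.zeros(Float32, 1)
    @cuda threads=32 add_one_kernel(a)
    # Array(a)[1] should now equal 32f0: each of the 32 threads added 1f0 atomically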

Dynamic parallelism

Similarly to launching kernels from the host, you can use @cuda while passing dynamic=true for launching kernels from the device. A lower-level API is available as well:

CUDA.dynamic_cufunctionFunction
dynamic_cufunction(f, tt=Tuple{})

Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. Device-side equivalent of CUDA.cufunction.

No keyword arguments are supported.

source
CUDA.DeviceKernelType
(::HostKernel)(args...; kwargs...)
-(::DeviceKernel)(args...; kwargs...)

Low-level interface to call a compiled kernel, passing GPU-compatible arguments in args. For a higher-level interface, use @cuda.

A HostKernel is callable on the host, and a DeviceKernel is callable on the device (created by @cuda with dynamic=true).

The following keyword arguments are supported:

  • threads (default: 1): Number of threads per block, or a 1-, 2- or 3-tuple of dimensions (e.g. threads=(32, 32) for a 2D block of 32×32 threads). Use threadIdx() and blockDim() to query from within the kernel.
  • blocks (default: 1): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of dimensions (e.g. blocks=(2, 4, 2) for a 3D grid of blocks). Use blockIdx() and gridDim() to query from within the kernel.
  • shmem (default: 0): Amount of dynamic shared memory in bytes to allocate per thread block; used by CuDynamicSharedArray.
  • stream (default: stream()): CuStream to launch the kernel on.
  • cooperative (default: false): whether to launch a cooperative kernel that supports grid synchronization (see CG.this_grid and CG.sync). Note that this requires care wrt. the number of blocks launched.
source

Cooperative groups

CUDA.CGModule

CUDA.jl's cooperative groups implementation.

Cooperative groups in CUDA offer a structured approach to synchronize and communicate among threads. They allow developers to define specific groups of threads, providing a means to fine-tune inter-thread communication granularity. By offering a more nuanced alternative to traditional CUDA synchronization methods, cooperative groups enable a more controlled and efficient parallel decomposition in kernel design.

The following functionality is available in CUDA.jl:

  • implicit groups: thread blocks, grid groups, and coalesced groups.
  • synchronization: sync, barrier_arrive, barrier_wait
  • warp collectives for coalesced groups: shuffle and voting
  • data transfer: memcpy_async, wait and wait_prior

Noteworthy missing functionality:

  • implicit groups: clusters, and multi-grid groups (which are deprecated)
  • explicit groups: tiling and partitioning
source

Group construction and properties

CUDA.CG.thread_rankFunction
thread_rank(group)

Returns the linearized rank of the calling thread along the interval [1, num_threads()].

source
CUDA.CG.thread_blockType
thread_block <: thread_group

Every GPU kernel is executed by a grid of thread blocks, and threads within each block are guaranteed to reside on the same streaming multiprocessor. A thread_block represents a thread block whose dimensions are not known until runtime.

Constructed via this_thread_block

source
CUDA.CG.group_indexFunction
group_index(tb::thread_block)

3-Dimensional index of the block within the launched grid.

source
CUDA.CG.grid_groupType
grid_group <: thread_group

Threads within this group are guaranteed to be co-resident on the same device within the same launched kernel. To use this group, the kernel must have been launched with @cuda cooperative=true, and the device must support it (queryable device attribute).

Constructed via this_grid.

source
CUDA.CG.coalesced_groupType
coalesced_group <: thread_group

A group representing the current set of converged threads in a warp. The size of the group is not guaranteed and it may return a group of only one thread (itself).

This group exposes warp-synchronous builtins. Constructed via coalesced_threads.

source
CUDA.CG.meta_group_sizeFunction
meta_group_size(cg::coalesced_group)

Total number of partitions created out of all CTAs when the group was created.

source
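
A small sketch of the implicit-group API inside a kernel (cg_kernel is illustrative; CG refers to the CUDA.CG submodule):

    using CUDA: CG

    function cg_kernel(a)
        block = CG.this_thread_block()
        i = CG.thread_rank(block)
        @inbounds a[i] = i
        CG.sync(block)
        return nothing
    end

    a = CUDA.zeros(Int32, 128)
    @cuda threads=length(a) cg_kernel(a)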

Synchronization

Data transfer

CUDA.CG.memcpy_asyncFunction
memcpy_async(group, dst, src, bytes)

Perform a group-wide collective memory copy from src to dst of bytes bytes. This operation may be performed asynchronously, so you should wait or wait_prior before using the data. It is only supported by thread blocks and coalesced groups.

For this operation to be performed asynchronously, the following conditions must be met:

  • the source and destination memory should be aligned to 4, 8 or 16 bytes. This will be deduced from the datatype, but can also be specified explicitly using CUDA.align.
  • the source should be global memory, and the destination should be shared memory.
  • the device should have compute capability 8.0 or higher.
source

Math

Many mathematical functions are provided by the libdevice library, and are wrapped by CUDA.jl. These functions are used to implement well-known functions from the Julia standard library and packages like SpecialFunctions.jl, e.g., calling the cos function will automatically use __nv_cos from libdevice if possible.

Some functions do not have a counterpart in the Julia ecosystem; those have to be called directly. For example, to call __nv_logb or __nv_logbf you use CUDA.logb in a kernel.

For a list of available functions, look at src/device/intrinsics/math.jl.
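
For example, calling such an intrinsic directly from a kernel (a sketch; logb_kernel is illustrative):

    function logb_kernel(a)
        i = threadIdx().x
        @inbounds a[i] = CUDA.logb(a[i])   # lowers to __nv_logbf for Float32 inputs
        return nothing
    end

    a = CUDA.rand(Float32, 32)
    @cuda threads=length(a) logb_kernel(a)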

WMMA

Warp matrix multiply-accumulate (WMMA) is a CUDA API to access Tensor Cores, a new hardware feature in Volta GPUs to perform mixed precision matrix multiply-accumulate operations. The interface is split in two levels, both available in the WMMA submodule: low level wrappers around the LLVM intrinsics, and a higher-level API similar to that of CUDA C.

LLVM Intrinsics

Load matrix

CUDA.WMMA.llvm_wmma_loadFunction
WMMA.llvm_wmma_load_{matrix}_{layout}_{shape}_{addr_space}_stride_{elem_type}(src_addr, stride)

Wrapper around the LLVM intrinsic @llvm.nvvm.wmma.load.{matrix}.sync.{layout}.{shape}.{addr_space}.stride.{elem_type}.

Arguments

  • src_addr: The memory address to load from.
  • stride: The leading dimension of the matrix, in numbers of elements.

Placeholders

  • {matrix}: The matrix to load. Can be a, b or c.
  • {layout}: The storage layout for the matrix. Can be row or col, for row major (C style) or column major (Julia style), respectively.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {addr_space}: The address space of src_addr. Can be empty (generic addressing), shared or global.
  • {elem_type}: The type of each element in the matrix. For a and b matrices, valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point). For c and d matrices, valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
source

Perform multiply-accumulate

CUDA.WMMA.llvm_wmma_mmaFunction
WMMA.llvm_wmma_mma_{a_layout}_{b_layout}_{shape}_{d_elem_type}_{c_elem_type}(a, b, c) or
-WMMA.llvm_wmma_mma_{a_layout}_{b_layout}_{shape}_{a_elem_type}(a, b, c)

For floating point operations: wrapper around the LLVM intrinsic @llvm.nvvm.wmma.mma.sync.{a_layout}.{b_layout}.{shape}.{d_elem_type}.{c_elem_type}. For all other operations: wrapper around the LLVM intrinsic @llvm.nvvm.wmma.mma.sync.{a_layout}.{b_layout}.{shape}.{a_elem_type}.

Arguments

  • a: The WMMA fragment corresponding to the matrix $A$.
  • b: The WMMA fragment corresponding to the matrix $B$.
  • c: The WMMA fragment corresponding to the matrix $C$.

Placeholders

  • {a_layout}: The storage layout for matrix $A$. Can be row or col, for row major (C style) or column major (Julia style), respectively. Note that this must match the layout used in the load operation.
  • {b_layout}: The storage layout for matrix $B$. Can be row or col, for row major (C style) or column major (Julia style), respectively. Note that this must match the layout used in the load operation.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {a_elem_type}: The type of each element in the $A$ matrix. Valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point).
  • {d_elem_type}: The type of each element in the resultant $D$ matrix. Valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
  • {c_elem_type}: The type of each element in the $C$ matrix. Valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
Warning

Remember that the shape, type and layout of all operations (be it MMA, load or store) MUST match. Otherwise, the behaviour is undefined!

source

Store matrix

CUDA.WMMA.llvm_wmma_storeFunction
WMMA.llvm_wmma_store_d_{layout}_{shape}_{addr_space}_stride_{elem_type}(dst_addr, data, stride)

Wrapper around the LLVM intrinsic @llvm.nvvm.wmma.store.d.sync.{layout}.{shape}.{addr_space}.stride.{elem_type}.

Arguments

  • dst_addr: The memory address to store to.
  • data: The $D$ fragment to store.
  • stride: The leading dimension of the matrix, in numbers of elements.

Placeholders

  • {layout}: The storage layout for the matrix. Can be row or col, for row major (C style) or column major (Julia style), respectively.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {addr_space}: The address space of src_addr. Can be empty (generic addressing), shared or global.
  • {elem_type}: The type of each element in the matrix. For a and b matrices, valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point). For c and d matrices, valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
source

CUDA C-like API

Fragment

CUDA.WMMA.UnspecifiedType
WMMA.Unspecified

Type that represents a matrix stored in an unspecified order.

Warning

This storage format is not valid for all WMMA operations!

source
CUDA.WMMA.FragmentType
WMMA.Fragment

Type that represents per-thread intermediate results of WMMA operations.

You can access individual elements using the x member or [] operator, but beware that the exact ordering of elements is unspecified.

source

WMMA configuration

CUDA.WMMA.ConfigType
WMMA.Config{M, N, K, d_type}

Type that contains all information for WMMA operations that cannot be inferred from the arguments' types.

WMMA instructions calculate the matrix multiply-accumulate operation $D = A \cdot B + C$, where $A$ is a $M \times K$ matrix, $B$ a $K \times N$ matrix, and $C$ and $D$ are $M \times N$ matrices.

d_type refers to the type of the elements of matrix $D$, and can be either Float16 or Float32.

All WMMA operations take a Config as their final argument.

Examples

julia> config = WMMA.Config{16, 16, 16, Float32}
-CUDA.WMMA.Config{16, 16, 16, Float32}
source

Load matrix

CUDA.WMMA.load_aFunction
WMMA.load_a(addr, stride, layout, config)
+    @cuprint "Hello, World $(42)\n"
source
CUDA.@cuprintfMacro
@cuprintf("%Fmt", args...)

Print a formatted string in device context on the host standard output.

Note that this is not a fully C-compliant printf implementation; see the CUDA documentation for supported options and inputs.

Also beware that it is an untyped and unforgiving printf implementation. Type widths need to match, e.g., printing a 64-bit Julia integer requires the %ld formatting string.

source
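As a sketch of typical usage (the kernel name here is illustrative), note that the format specifier has to match the argument type exactly:

using CUDA

function print_kernel()
    # %d expects a 32-bit integer, so convert the thread index explicitly
    @cuprintf("Hello from thread %d\n", Int32(threadIdx().x))
    return
end

@cuda threads=4 print_kernel()
synchronize()  # ensure the device output is flushed before the program exits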

Assertions

CUDA.@cuassertMacro
@assert cond [text]

Signal assertion failure to the CUDA driver if cond is false. Preferred syntax for writing assertions, mimicking Base.@assert. Message text is optionally displayed upon assertion failure.

Warning

A failed assertion will crash the GPU, so use sparingly as a debugging tool. Furthermore, the assertion might be disabled at various optimization levels, and thus should not cause any side-effects.

source
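For example, a minimal sketch of guarding a kernel against unexpected input (the kernel name is illustrative):

using CUDA

function assert_kernel(xs)
    i = threadIdx().x
    CUDA.@cuassert xs[i] >= 0f0 "negative input"
    return
end

@cuda threads=4 assert_kernel(CuArray([1f0, 2f0, 3f0, 4f0]))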

Atomics

A high-level macro is available to annotate expressions with:

CUDA.@atomicMacro
@atomic a[I] = op(a[I], val)
+@atomic a[I] ...= val

Atomically perform a sequence of operations that loads an array element a[I], performs the operation op on that value and a second value val, and writes the result back to the array. This sequence can be written out as a regular assignment, in which case the same array element should be used in the left and right hand side of the assignment, or as an in-place application of a known operator. In both cases, the array reference should be pure and not induce any side-effects.

Warning

This interface is experimental, and might change without warning. Use the lower-level atomic_...! functions for a stable API, albeit one limited to natively-supported ops.

source
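A minimal sketch of how this is typically used, here for a simple histogram (the kernel and array names are illustrative):

using CUDA

function histogram_kernel!(bins, data)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(data)
        # atomically increment the bin selected by this thread's data element
        CUDA.@atomic bins[data[i]] += Int32(1)
    end
    return
end

data = CuArray(rand(1:10, 1024))
bins = CUDA.zeros(Int32, 10)
@cuda threads=256 blocks=4 histogram_kernel!(bins, data)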

If your expression is not recognized, or you need more control, use the underlying functions:

CUDA.atomic_cas!Function
atomic_cas!(ptr::LLVMPtr{T}, cmp::T, val::T)

Reads the value old located at address ptr and compares it with cmp. If old equals cmp, stores val at the same address; otherwise, the value at ptr is left unchanged. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64. Additionally, on GPU hardware with compute capability 7.0+, values of type UInt16 are supported.

source
CUDA.atomic_xchg!Function
atomic_xchg!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr and stores val at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_add!Function
atomic_add!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old + val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32, UInt64, and Float32. Additionally, on GPU hardware with compute capability 6.0+, values of type Float64 are supported.

source
CUDA.atomic_sub!Function
atomic_sub!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old - val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_and!Function
atomic_and!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old & val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_or!Function
atomic_or!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old | val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_xor!Function
atomic_xor!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes old ⊻ val, and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_min!Function
atomic_min!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes min(old, val), and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_max!Function
atomic_max!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes max(old, val), and stores the result back to memory at the same address. These operations are performed in one atomic transaction. The function returns old.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.

source
CUDA.atomic_inc!Function
atomic_inc!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes ((old >= val) ? 0 : (old+1)), and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The function returns old.

This operation is only supported for values of type Int32.

source
CUDA.atomic_dec!Function
atomic_dec!(ptr::LLVMPtr{T}, val::T)

Reads the value old located at address ptr, computes (((old == 0) | (old > val)) ? val : (old-1) ), and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The function returns old.

This operation is only supported for values of type Int32.

source
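These lower-level functions operate on raw device pointers. A sketch of obtaining such a pointer from a device array inside a kernel (the kernel name is illustrative):

using CUDA

function count_kernel!(counter)
    # pointer(::CuDeviceArray) yields an LLVMPtr suitable for the atomic_ functions
    CUDA.atomic_add!(pointer(counter), Int32(1))
    return
end

counter = CUDA.zeros(Int32, 1)
@cuda threads=32 count_kernel!(counter)
Array(counter)  # should contain 32, one increment per thread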

Dynamic parallelism

Similarly to launching kernels from the host, you can use @cuda while passing dynamic=true for launching kernels from the device. A lower-level API is available as well:
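A minimal sketch of a device-side launch, assuming a device that supports dynamic parallelism (the kernel names are illustrative):

using CUDA

function child(x)
    x[1] += 1
    return
end

function parent(x)
    # launched from the device; the parent grid does not complete
    # until all of its child grids have completed
    @cuda dynamic=true child(x)
    return
end

x = CUDA.zeros(Int32, 1)
@cuda parent(x)
synchronize()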

CUDA.dynamic_cufunctionFunction
dynamic_cufunction(f, tt=Tuple{})

Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. Device-side equivalent of CUDA.cufunction.

No keyword arguments are supported.

source
CUDA.DeviceKernelType
(::HostKernel)(args...; kwargs...)
+(::DeviceKernel)(args...; kwargs...)

Low-level interface to call a compiled kernel, passing GPU-compatible arguments in args. For a higher-level interface, use @cuda.

A HostKernel is callable on the host, and a DeviceKernel is callable on the device (created by @cuda with dynamic=true).

The following keyword arguments are supported:

  • threads (default: 1): Number of threads per block, or a 1-, 2- or 3-tuple of dimensions (e.g. threads=(32, 32) for a 2D block of 32×32 threads). Use threadIdx() and blockDim() to query from within the kernel.
  • blocks (default: 1): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of dimensions (e.g. blocks=(2, 4, 2) for a 3D grid of blocks). Use blockIdx() and gridDim() to query from within the kernel.
  • shmem (default: 0): Amount of dynamic shared memory in bytes to allocate per thread block; used by CuDynamicSharedArray.
  • stream (default: stream()): CuStream to launch the kernel on.
  • cooperative (default: false): whether to launch a cooperative kernel that supports grid synchronization (see CG.this_grid and CG.sync). Note that this requires care wrt. the number of blocks launched.
source
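As an illustration of these keyword arguments, a common pattern is to compile with launch=false, query a suitable configuration via the occupancy API, and only then launch the kernel object (a sketch; vadd! is an illustrative kernel):

using CUDA

function vadd!(c, a, b)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

a, b = CUDA.rand(1024), CUDA.rand(1024)
c = similar(a)

kernel = @cuda launch=false vadd!(c, a, b)
config = launch_configuration(kernel.fun)   # suggested occupancy for this kernel
threads = min(length(c), config.threads)
blocks = cld(length(c), threads)
kernel(c, a, b; threads, blocks)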

Cooperative groups

CUDA.CGModule

CUDA.jl's cooperative groups implementation.

Cooperative groups in CUDA offer a structured approach to synchronize and communicate among threads. They allow developers to define specific groups of threads, providing a means to fine-tune inter-thread communication granularity. By offering a more nuanced alternative to traditional CUDA synchronization methods, cooperative groups enable a more controlled and efficient parallel decomposition in kernel design.

The following functionality is available in CUDA.jl:

  • implicit groups: thread blocks, grid groups, and coalesced groups.
  • synchronization: sync, barrier_arrive, barrier_wait
  • warp collectives for coalesced groups: shuffle and voting
  • data transfer: memcpy_async, wait and wait_prior

Noteworthy missing functionality:

  • implicit groups: clusters, and multi-grid groups (which are deprecated)
  • explicit groups: tiling and partitioning
source
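A minimal sketch of using an implicit group inside a kernel (the kernel name is illustrative):

using CUDA
const CG = CUDA.CG

function cg_kernel!(x)
    tb = CG.this_thread_block()
    r = CG.thread_rank(tb)      # 1-based rank within the block
    @inbounds x[r] = r
    CG.sync(tb)                 # block-wide barrier
    return
end

x = CUDA.zeros(Int32, 32)
@cuda threads=32 cg_kernel!(x)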

Group construction and properties

CUDA.CG.thread_rankFunction
thread_rank(group)

Returns the linearized rank of the calling thread along the interval [1, num_threads()].

source
CUDA.CG.thread_blockType
thread_block <: thread_group

Every GPU kernel is executed by a grid of thread blocks, and threads within each block are guaranteed to reside on the same streaming multiprocessor. A thread_block represents a thread block whose dimensions are not known until runtime.

Constructed via this_thread_block

source
CUDA.CG.group_indexFunction
group_index(tb::thread_block)

3-Dimensional index of the block within the launched grid.

source
CUDA.CG.grid_groupType
grid_group <: thread_group

Threads within this group are guaranteed to be co-resident on the same device within the same launched kernel. To use this group, the kernel must have been launched with @cuda cooperative=true, and the device must support it (queryable device attribute).

Constructed via this_grid.

source
CUDA.CG.coalesced_groupType
coalesced_group <: thread_group

A group representing the current set of converged threads in a warp. The size of the group is not guaranteed and it may return a group of only one thread (itself).

This group exposes warp-synchronous builtins. Constructed via coalesced_threads.

source
CUDA.CG.meta_group_sizeFunction
meta_group_size(cg::coalesced_group)

Total number of partitions created out of all CTAs when the group was created.

source

Synchronization

Data transfer

CUDA.CG.memcpy_asyncFunction
memcpy_async(group, dst, src, bytes)

Perform a group-wide collective memory copy from src to dst of bytes bytes. This operation may be performed asynchronously, so you should wait or wait_prior before using the data. It is only supported by thread blocks and coalesced groups.

For this operation to be performed asynchronously, the following conditions must be met:

  • the source and destination memory should be aligned to 4, 8 or 16 bytes. This will be deduced from the datatype, but can also be specified explicitly using CUDA.align.
  • the source should be global memory, and the destination should be shared memory.
  • the device should have compute capability 8.0 or higher.
source

Math

Many mathematical functions are provided by the libdevice library, and are wrapped by CUDA.jl. These functions are used to implement well-known functions from the Julia standard library and packages like SpecialFunctions.jl, e.g., calling the cos function will automatically use __nv_cos from libdevice if possible.

Some functions do not have a counterpart in the Julia ecosystem; those have to be called directly. For example, to call __nv_logb or __nv_logbf you use CUDA.logb in a kernel.

For a list of available functions, look at src/device/intrinsics/math.jl.
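For example, a sketch of calling one of these functions directly from a kernel (the kernel name is illustrative):

using CUDA

function logb_kernel!(y, x)
    i = threadIdx().x
    @inbounds y[i] = CUDA.logb(x[i])   # uses __nv_logbf for Float32 inputs
    return
end

x = CUDA.rand(Float32, 32)
y = similar(x)
@cuda threads=32 logb_kernel!(y, x)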

WMMA

Warp matrix multiply-accumulate (WMMA) is a CUDA API to access Tensor Cores, a new hardware feature in Volta GPUs to perform mixed precision matrix multiply-accumulate operations. The interface is split in two levels, both available in the WMMA submodule: low level wrappers around the LLVM intrinsics, and a higher-level API similar to that of CUDA C.

LLVM Intrinsics

Load matrix

CUDA.WMMA.llvm_wmma_loadFunction
WMMA.llvm_wmma_load_{matrix}_{layout}_{shape}_{addr_space}_stride_{elem_type}(src_addr, stride)

Wrapper around the LLVM intrinsic @llvm.nvvm.wmma.load.{matrix}.sync.{layout}.{shape}.{addr_space}.stride.{elem_type}.

Arguments

  • src_addr: The memory address to load from.
  • stride: The leading dimension of the matrix, in numbers of elements.

Placeholders

  • {matrix}: The matrix to load. Can be a, b or c.
  • {layout}: The storage layout for the matrix. Can be row or col, for row major (C style) or column major (Julia style), respectively.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {addr_space}: The address space of src_addr. Can be empty (generic addressing), shared or global.
  • {elem_type}: The type of each element in the matrix. For a and b matrices, valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point). For c and d matrices, valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
source

Perform multiply-accumulate

CUDA.WMMA.llvm_wmma_mmaFunction
WMMA.llvm_wmma_mma_{a_layout}_{b_layout}_{shape}_{d_elem_type}_{c_elem_type}(a, b, c) or
+WMMA.llvm_wmma_mma_{a_layout}_{b_layout}_{shape}_{a_elem_type}(a, b, c)

For floating point operations: wrapper around the LLVM intrinsic @llvm.nvvm.wmma.mma.sync.{a_layout}.{b_layout}.{shape}.{d_elem_type}.{c_elem_type} For all other operations: wrapper around the LLVM intrinsic @llvm.nvvm.wmma.mma.sync.{a_layout}.{b_layout}.{shape}.{a_elem_type}

Arguments

  • a: The WMMA fragment corresponding to the matrix $A$.
  • b: The WMMA fragment corresponding to the matrix $B$.
  • c: The WMMA fragment corresponding to the matrix $C$.

Placeholders

  • {a_layout}: The storage layout for matrix $A$. Can be row or col, for row major (C style) or column major (Julia style), respectively. Note that this must match the layout used in the load operation.
  • {b_layout}: The storage layout for matrix $B$. Can be row or col, for row major (C style) or column major (Julia style), respectively. Note that this must match the layout used in the load operation.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {a_elem_type}: The type of each element in the $A$ matrix. Valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point).
  • {d_elem_type}: The type of each element in the resultant $D$ matrix. Valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
  • {c_elem_type}: The type of each element in the $C$ matrix. Valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
Warning

Remember that the shape, type and layout of all operations (be it MMA, load or store) MUST match. Otherwise, the behaviour is undefined!

source

Store matrix

CUDA.WMMA.llvm_wmma_storeFunction
WMMA.llvm_wmma_store_d_{layout}_{shape}_{addr_space}_stride_{elem_type}(dst_addr, data, stride)

Wrapper around the LLVM intrinsic @llvm.nvvm.wmma.store.d.sync.{layout}.{shape}.{addr_space}.stride.{elem_type}.

Arguments

  • dst_addr: The memory address to store to.
  • data: The $D$ fragment to store.
  • stride: The leading dimension of the matrix, in numbers of elements.

Placeholders

  • {layout}: The storage layout for the matrix. Can be row or col, for row major (C style) or column major (Julia style), respectively.
  • {shape}: The overall shape of the MAC operation. Valid values are m16n16k16, m32n8k16, and m8n32k16.
  • {addr_space}: The address space of dst_addr. Can be empty (generic addressing), shared or global.
  • {elem_type}: The type of each element in the matrix. For a and b matrices, valid values are u8 (byte unsigned integer), s8 (byte signed integer), and f16 (half precision floating point). For c and d matrices, valid values are s32 (32-bit signed integer), f16 (half precision floating point), and f32 (full precision floating point).
source

CUDA C-like API

Fragment

CUDA.WMMA.UnspecifiedType
WMMA.Unspecified

Type that represents a matrix stored in an unspecified order.

Warning

This storage format is not valid for all WMMA operations!

source
CUDA.WMMA.FragmentType
WMMA.Fragment

Type that represents per-thread intermediate results of WMMA operations.

You can access individual elements using the x member or [] operator, but beware that the exact ordering of elements is unspecified.

source

WMMA configuration

CUDA.WMMA.ConfigType
WMMA.Config{M, N, K, d_type}

Type that contains all information for WMMA operations that cannot be inferred from the argument's types.

WMMA instructions calculate the matrix multiply-accumulate operation $D = A \cdot B + C$, where $A$ is a $M \times K$ matrix, $B$ a $K \times N$ matrix, and $C$ and $D$ are $M \times N$ matrices.

d_type refers to the type of the elements of matrix $D$, and can be either Float16 or Float32.

All WMMA operations take a Config as their final argument.

Examples

julia> config = WMMA.Config{16, 16, 16, Float32}
+CUDA.WMMA.Config{16, 16, 16, Float32}
source

Load matrix

CUDA.WMMA.load_aFunction
WMMA.load_a(addr, stride, layout, config)
 WMMA.load_b(addr, stride, layout, config)
-WMMA.load_c(addr, stride, layout, config)

Load the matrix a, b or c from the memory location indicated by addr, and return the resulting WMMA.Fragment.

Arguments

  • addr: The address to load the matrix from.
  • stride: The leading dimension of the matrix pointed to by addr, specified in number of elements.
  • layout: The storage layout of the matrix. Possible values are WMMA.RowMajor and WMMA.ColMajor.
  • config: The WMMA configuration that should be used for loading this matrix. See WMMA.Config.

See also: WMMA.Fragment, WMMA.FragmentLayout, WMMA.Config

Warning

All threads in a warp MUST execute the load operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

WMMA.load_b and WMMA.load_c have the same signature.

Perform multiply-accumulate

CUDA.WMMA.mmaFunction
WMMA.mma(a, b, c, conf)

Perform the matrix multiply-accumulate operation $D = A \cdot B + C$.

Arguments

Warning

All threads in a warp MUST execute the mma operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

Store matrix

CUDA.WMMA.store_dFunction
WMMA.store_d(addr, d, stride, layout, config)

Store the result matrix d to the memory location indicated by addr.

Arguments

  • addr: The address to store the matrix to.
  • d: The WMMA.Fragment corresponding to the d matrix.
  • stride: The leading dimension of the matrix pointed to by addr, specified in number of elements.
  • layout: The storage layout of the matrix. Possible values are WMMA.RowMajor and WMMA.ColMajor.
  • config: The WMMA configuration that should be used for storing this matrix. See WMMA.Config.

See also: WMMA.Fragment, WMMA.FragmentLayout, WMMA.Config

Warning

All threads in a warp MUST execute the store operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

Fill fragment

CUDA.WMMA.fill_cFunction
WMMA.fill_c(value, config)

Return a WMMA.Fragment filled with the value value.

This operation is useful if you want to implement a matrix multiplication (and thus want to set $C = O$).

Arguments

  • value: The value used to fill the fragment. Can be a Float16 or Float32.
  • config: The WMMA configuration that should be used for this WMMA operation. See WMMA.Config.
source
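Putting these pieces together, a minimal sketch of a 16×16×16 mixed-precision multiply-accumulate executed by a single warp (matrix and kernel names are illustrative):

using CUDA
using CUDA: WMMA

a = CUDA.rand(Float16, 16, 16)
b = CUDA.rand(Float16, 16, 16)
c = CUDA.rand(Float32, 16, 16)
d = similar(c)

function wmma_kernel(a, b, c, d)
    conf = WMMA.Config{16, 16, 16, Float32}

    a_frag = WMMA.load_a(pointer(a), 16, WMMA.ColMajor, conf)
    b_frag = WMMA.load_b(pointer(b), 16, WMMA.ColMajor, conf)
    c_frag = WMMA.load_c(pointer(c), 16, WMMA.ColMajor, conf)

    d_frag = WMMA.mma(a_frag, b_frag, c_frag, conf)

    WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
    return
end

@cuda threads=32 wmma_kernel(a, b, c, d)   # one warp performs the WMMA cooperatively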

Other

CUDA.alignType
CUDA.align{N}(obj)

Construct an aligned object, providing alignment information to APIs that require it.

source
+WMMA.load_c(addr, stride, layout, config)

Load the matrix a, b or c from the memory location indicated by addr, and return the resulting WMMA.Fragment.

Arguments

  • addr: The address to load the matrix from.
  • stride: The leading dimension of the matrix pointed to by addr, specified in number of elements.
  • layout: The storage layout of the matrix. Possible values are WMMA.RowMajor and WMMA.ColMajor.
  • config: The WMMA configuration that should be used for loading this matrix. See WMMA.Config.

See also: WMMA.Fragment, WMMA.FragmentLayout, WMMA.Config

Warning

All threads in a warp MUST execute the load operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

WMMA.load_b and WMMA.load_c have the same signature.

Perform multiply-accumulate

CUDA.WMMA.mmaFunction
WMMA.mma(a, b, c, conf)

Perform the matrix multiply-accumulate operation $D = A \cdot B + C$.

Arguments

Warning

All threads in a warp MUST execute the mma operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

Store matrix

CUDA.WMMA.store_dFunction
WMMA.store_d(addr, d, stride, layout, config)

Store the result matrix d to the memory location indicated by addr.

Arguments

  • addr: The address to store the matrix to.
  • d: The WMMA.Fragment corresponding to the d matrix.
  • stride: The leading dimension of the matrix pointed to by addr, specified in number of elements.
  • layout: The storage layout of the matrix. Possible values are WMMA.RowMajor and WMMA.ColMajor.
  • config: The WMMA configuration that should be used for storing this matrix. See WMMA.Config.

See also: WMMA.Fragment, WMMA.FragmentLayout, WMMA.Config

Warning

All threads in a warp MUST execute the store operation in lockstep, and have to use exactly the same arguments. Failure to do so will result in undefined behaviour.

source

Fill fragment

CUDA.WMMA.fill_cFunction
WMMA.fill_c(value, config)

Return a WMMA.Fragment filled with the value value.

This operation is useful if you want to implement a matrix multiplication (and thus want to set $C = O$).

Arguments

  • value: The value used to fill the fragment. Can be a Float16 or Float32.
  • config: The WMMA configuration that should be used for this WMMA operation. See WMMA.Config.
source

Other

CUDA.alignType
CUDA.align{N}(obj)

Construct an aligned object, providing alignment information to APIs that require it.

source
diff --git a/dev/development/debugging/index.html b/dev/development/debugging/index.html index f539aa0fa8..9adc8ed8ad 100644 --- a/dev/development/debugging/index.html +++ b/dev/development/debugging/index.html @@ -44,4 +44,4 @@ julia> exit() ========= ERROR SUMMARY: 0 errors -Process(`.julia/artifacts/feb6b469b6047f344fec54df2619d65f6b704bdb/cuda/compute-sanitizer/compute-sanitizer --launch-timeout=0 --target-processes=all --report-api-errors=no julia`, ProcessExited(0))

By default, compute-sanitizer launches the memcheck tool, which is great for dealing with memory issues. Other tools can be selected with the --tool argument, e.g., to find thread synchronization hazards use --tool synccheck, racecheck can be used to find shared memory data races, and initcheck is useful for spotting uses of uninitialized device memory.

cuda-gdb

To debug Julia code, you can use the CUDA debugger cuda-gdb. When using this tool, it is recommended to enable Julia debug mode 2 so that debug information is emitted. Do note that the DWARF info emitted by Julia is currently insufficient to e.g. inspect variables, so the debug experience will not be pleasant.

If you encounter the CUDBG_ERROR_UNINITIALIZED error, ensure all your devices are supported by cuda-gdb (e.g., Kepler-era devices aren't). If some aren't, re-start Julia with CUDA_VISIBLE_DEVICES set to ignore that device.

+Process(`.julia/artifacts/feb6b469b6047f344fec54df2619d65f6b704bdb/cuda/compute-sanitizer/compute-sanitizer --launch-timeout=0 --target-processes=all --report-api-errors=no julia`, ProcessExited(0))

By default, compute-sanitizer launches the memcheck tool, which is great for dealing with memory issues. Other tools can be selected with the --tool argument, e.g., to find thread synchronization hazards use --tool synccheck, racecheck can be used to find shared memory data races, and initcheck is useful for spotting uses of uninitialized device memory.

cuda-gdb

To debug Julia code, you can use the CUDA debugger cuda-gdb. When using this tool, it is recommended to enable Julia debug mode 2 so that debug information is emitted. Do note that the DWARF info emitted by Julia is currently insufficient to e.g. inspect variables, so the debug experience will not be pleasant.

If you encounter the CUDBG_ERROR_UNINITIALIZED error, ensure all your devices are supported by cuda-gdb (e.g., Kepler-era devices aren't). If some aren't, re-start Julia with CUDA_VISIBLE_DEVICES set to ignore that device.

diff --git a/dev/development/kernel/index.html b/dev/development/kernel/index.html index cd76e3aba4..87b63caadf 100644 --- a/dev/development/kernel/index.html +++ b/dev/development/kernel/index.html @@ -191,4 +191,4 @@ 1 2 3 - 4

The above example waits for the copy to complete before continuing, but it is also possible to have multiple copies in flight using the CG.wait_prior function, which waits for all but the last N copies to complete.

Warp matrix multiply-accumulate

Warp matrix multiply-accumulate (WMMA) is a cooperative operation to perform mixed precision matrix multiply-accumulate on the tensor core hardware of recent GPUs. The CUDA.jl interface is split in two levels, both available in the WMMA submodule: low level wrappers around the LLVM intrinsics, and a higher-level API similar to that of CUDA C.

Terminology

The WMMA operations perform a matrix multiply-accumulate. More concretely, it calculates $D = A \cdot B + C$, where $A$ is a $M \times K$ matrix, $B$ is a $K \times N$ matrix, and $C$ and $D$ are $M \times N$ matrices.

However, not all values of $M$, $N$ and $K$ are allowed. The tuple $(M, N, K)$ is often called the "shape" of the multiply accumulate operation.

The multiply-accumulate consists of the following steps:

  • Load the matrices $A$, $B$ and $C$ from memory to registers using a WMMA load operation.
  • Perform the matrix multiply-accumulate of $A$, $B$ and $C$ to obtain $D$ using a WMMA MMA operation. $D$ is stored in hardware registers after this step.
  • Store the result $D$ back to memory using a WMMA store operation.

Note that WMMA is a warp-wide operation, which means that all threads in a warp must cooperate, and execute the WMMA operations in lockstep. Failure to do so will result in undefined behaviour.

Each thread in a warp will hold a part of the matrix in its registers. In WMMA parlance, this part is referred to as a "fragment". Note that the exact mapping between matrix elements and fragment is unspecified, and subject to change in future versions.

Finally, it is important to note that the resultant $D$ matrix can be used as a $C$ matrix for a subsequent multiply-accumulate. This is useful if one needs to calculate a sum of the form $\sum_{i=0}^{n} A_i B_i$, where $A_i$ and $B_i$ are matrices of the correct dimension.

LLVM Intrinsics

The LLVM intrinsics are accessible by using the one-to-one Julia wrappers. The return type of each wrapper is the Julia type that corresponds closest to the return type of the LLVM intrinsic. For example, LLVM's [8 x <2 x half>] becomes NTuple{8, NTuple{2, VecElement{Float16}}} in Julia. In essence, these wrappers return the SSA values returned by the LLVM intrinsic. Currently, all intrinsics that are available in LLVM 6, PTX 6.0 and SM 70 are implemented.

These LLVM intrinsics are then lowered to the correct PTX instructions by the LLVM NVPTX backend. For more information about the PTX instructions, please refer to the PTX Instruction Set Architecture Manual.

The LLVM intrinsics are subdivided in three categories:

  • load matrix
  • perform the matrix multiply-accumulate
  • store matrix

CUDA C-like API

The main difference between the CUDA C-like API and the lower-level wrappers is that the former enforces several constraints when working with WMMA. For example, it ensures that the $A$ fragment argument to the MMA instruction was obtained by a load_a call, and not by a load_b or load_c. Additionally, it makes sure that the data type and storage layout of the load/store operations and the MMA operation match.

The CUDA C-like API heavily uses Julia's dispatch mechanism. As such, the method names are much shorter than the LLVM intrinsic wrappers, as most information is baked into the type of the arguments rather than the method name.

Note that, in CUDA C++, the fragment is responsible for both the storage of intermediate results and the WMMA configuration. All CUDA C++ WMMA calls are function templates that take the resultant fragment as a by-reference argument. As a result, the type of this argument can be used during overload resolution to select the correct WMMA instruction to call.

In contrast, the API in Julia separates the WMMA storage (WMMA.Fragment) and configuration (WMMA.Config). Instead of taking the resultant fragment by reference, the Julia functions just return it. This makes the dataflow clearer, but it also means that the type of that fragment cannot be used for selection of the correct WMMA instruction. Thus, there is still a limited amount of information that cannot be inferred from the argument types, but must nonetheless match for all WMMA operations, such as the overall shape of the MMA. This is accomplished by a separate "WMMA configuration" (see WMMA.Config) that you create once, and then give as an argument to all intrinsics.

Element access and broadcasting

Similar to the CUDA C++ WMMA API, WMMA.Fragments have an x member that can be used to access individual elements. Note that, in contrast to the values returned by the LLVM intrinsics, the x member is flattened. For example, while the Float16 variants of the load_a intrinsics return NTuple{8, NTuple{2, VecElement{Float16}}}, the x member has type NTuple{16, Float16}.

Typically, you will only need to access the x member to perform elementwise operations. This can be more succinctly expressed using Julia's broadcast mechanism. For example, to double each element in a fragment, you can simply use:

frag = 2.0f0 .* frag
+ 4

The above example waits for the copy to complete before continuing, but it is also possible to have multiple copies in flight using the CG.wait_prior function, which waits for all but the last N copies to complete.

Warp matrix multiply-accumulate

Warp matrix multiply-accumulate (WMMA) is a cooperative operation to perform mixed precision matrix multiply-accumulate on the tensor core hardware of recent GPUs. The CUDA.jl interface is split in two levels, both available in the WMMA submodule: low level wrappers around the LLVM intrinsics, and a higher-level API similar to that of CUDA C.

Terminology

The WMMA operations perform a matrix multiply-accumulate. More concretely, it calculates $D = A \cdot B + C$, where $A$ is a $M \times K$ matrix, $B$ is a $K \times N$ matrix, and $C$ and $D$ are $M \times N$ matrices.

However, not all values of $M$, $N$ and $K$ are allowed. The tuple $(M, N, K)$ is often called the "shape" of the multiply accumulate operation.

The multiply-accumulate consists of the following steps:

  • Load the matrices $A$, $B$ and $C$ from memory to registers using a WMMA load operation.
  • Perform the matrix multiply-accumulate of $A$, $B$ and $C$ to obtain $D$ using a WMMA MMA operation. $D$ is stored in hardware registers after this step.
  • Store the result $D$ back to memory using a WMMA store operation.

Note that WMMA is a warp-wide operation, which means that all threads in a warp must cooperate, and execute the WMMA operations in lockstep. Failure to do so will result in undefined behaviour.

Each thread in a warp will hold a part of the matrix in its registers. In WMMA parlance, this part is referred to as a "fragment". Note that the exact mapping between matrix elements and fragment is unspecified, and subject to change in future versions.

Finally, it is important to note that the resultant $D$ matrix can be used as a $C$ matrix for a subsequent multiply-accumulate. This is useful if one needs to calculate a sum of the form $\sum_{i=0}^{n} A_i B_i$, where $A_i$ and $B_i$ are matrices of the correct dimension.

LLVM Intrinsics

The LLVM intrinsics are accessible by using the one-to-one Julia wrappers. The return type of each wrapper is the Julia type that corresponds closest to the return type of the LLVM intrinsic. For example, LLVM's [8 x <2 x half>] becomes NTuple{8, NTuple{2, VecElement{Float16}}} in Julia. In essence, these wrappers return the SSA values returned by the LLVM intrinsic. Currently, all intrinsics that are available in LLVM 6, PTX 6.0 and SM 70 are implemented.

These LLVM intrinsics are then lowered to the correct PTX instructions by the LLVM NVPTX backend. For more information about the PTX instructions, please refer to the PTX Instruction Set Architecture Manual.

The LLVM intrinsics are subdivided in three categories:

  • load matrix
  • perform the matrix multiply-accumulate
  • store matrix

CUDA C-like API

The main difference between the CUDA C-like API and the lower-level wrappers is that the former enforces several constraints when working with WMMA. For example, it ensures that the $A$ fragment argument to the MMA instruction was obtained by a load_a call, and not by a load_b or load_c. Additionally, it makes sure that the data type and storage layout of the load/store operations and the MMA operation match.

The CUDA C-like API heavily uses Julia's dispatch mechanism. As such, the method names are much shorter than the LLVM intrinsic wrappers, as most information is baked into the type of the arguments rather than the method name.

Note that, in CUDA C++, the fragment is responsible for both the storage of intermediate results and the WMMA configuration. All CUDA C++ WMMA calls are function templates that take the resultant fragment as a by-reference argument. As a result, the type of this argument can be used during overload resolution to select the correct WMMA instruction to call.

In contrast, the API in Julia separates the WMMA storage (WMMA.Fragment) and configuration (WMMA.Config). Instead of taking the resultant fragment by reference, the Julia functions just return it. This makes the dataflow clearer, but it also means that the type of that fragment cannot be used for selection of the correct WMMA instruction. Thus, there is still a limited amount of information that cannot be inferred from the argument types, but must nonetheless match for all WMMA operations, such as the overall shape of the MMA. This is accomplished by a separate "WMMA configuration" (see WMMA.Config) that you create once, and then give as an argument to all intrinsics.

Element access and broadcasting

Similar to the CUDA C++ WMMA API, WMMA.Fragments have an x member that can be used to access individual elements. Note that, in contrast to the values returned by the LLVM intrinsics, the x member is flattened. For example, while the Float16 variants of the load_a intrinsics return NTuple{8, NTuple{2, VecElement{Float16}}}, the x member has type NTuple{16, Float16}.

Typically, you will only need to access the x member to perform elementwise operations. This can be more succinctly expressed using Julia's broadcast mechanism. For example, to double each element in a fragment, you can simply use:

frag = 2.0f0 .* frag
diff --git a/dev/development/profiling/index.html b/dev/development/profiling/index.html index 76396ee350..e7b669831d 100644 --- a/dev/development/profiling/index.html +++ b/dev/development/profiling/index.html @@ -96,4 +96,4 @@ NVTX.@annotate function foo() ... -end

For more details, refer to the documentation of the NVTX.jl package.

Compiler options

Some tools, like NSight Compute, also make it possible to do source-level profiling. CUDA.jl will by default emit the necessary source line information, which you can disable by launching Julia with -g0. Conversely, launching with -g2 will emit additional debug information, which can be useful in combination with tools like cuda-gdb, but might hurt performance or code size.

+end

For more details, refer to the documentation of the NVTX.jl package.

Compiler options

Some tools, like NSight Compute, also make it possible to do source-level profiling. CUDA.jl will by default emit the necessary source line information, which you can disable by launching Julia with -g0. Conversely, launching with -g2 will emit additional debug information, which can be useful in combination with tools like cuda-gdb, but might hurt performance or code size.

diff --git a/dev/development/troubleshooting/index.html b/dev/development/troubleshooting/index.html index c7953d0696..5dcdd7344f 100644 --- a/dev/development/troubleshooting/index.html +++ b/dev/development/troubleshooting/index.html @@ -45,4 +45,4 @@ • %17 = call CUDA.sin(::Int64)::Union{}

Both from the IR and the list of calls Cthulhu offers to inspect further, we can see that the call to CUDA.sin(::Int64) results in an error: in the IR it is immediately followed by an unreachable, while in the list of calls it is inferred to return Union{}. Now that we know where to look, it's easy to figure out what's wrong:

help?> CUDA.sin
   # 2 methods for generic function "sin":
   [1] sin(x::Float32) in CUDA at /home/tim/Julia/pkg/CUDA/src/device/intrinsics/math.jl:13
-  [2] sin(x::Float64) in CUDA at /home/tim/Julia/pkg/CUDA/src/device/intrinsics/math.jl:12

There's no method of CUDA.sin that accepts an Int64, and thus the function was determined to unconditionally throw a method error. For now, we disallow these situations and refuse to compile, but in the spirit of dynamic languages we might change this behavior to just throw an error at run time.

+ [2] sin(x::Float64) in CUDA at /home/tim/Julia/pkg/CUDA/src/device/intrinsics/math.jl:12

There's no method of CUDA.sin that accepts an Int64, and thus the function was determined to unconditionally throw a method error. For now, we disallow these situations and refuse to compile, but in the spirit of dynamic languages we might change this behavior to just throw an error at run time.

diff --git a/dev/faq/index.html b/dev/faq/index.html index c2ff706f10..4215ab8173 100644 --- a/dev/faq/index.html +++ b/dev/faq/index.html @@ -20,4 +20,4 @@ ├─possible versions are: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] or uninstalled ├─restricted to versions * by an explicit requirement, leaving only versions [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] └─restricted by compatibility requirements with CUDA [052768ef] to versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] or uninstalled, leaving only versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] - └─CUDA [052768ef] log: see above

A common source of these incompatibilities is having both CUDA.jl and the older CUDAnative.jl/CuArrays.jl/CUDAdrv.jl stack installed: These are incompatible, and cannot coexist. You can inspect in the Pkg REPL which exact packages you have installed using the status --manifest option.

Can you wrap this or that CUDA API?

If a certain API isn't wrapped with some high-level functionality, you can always use the underlying C APIs which are always available as unexported methods. For example, you can access the CUDA driver library as cu prefixed, unexported functions like CUDA.cuDriverGetVersion. Similarly, vendor libraries like CUBLAS are available through their exported submodule handles, e.g., CUBLAS.cublasGetVersion_v2.

Any help on designing or implementing high-level wrappers for this low-level functionality is greatly appreciated, so please consider contributing your uses of these APIs on the respective repositories.

When installing CUDA.jl on a cluster, why does Julia stall during precompilation?

If you're working on a cluster, precompilation may stall if you have not requested sufficient memory. You may also wish to make sure you have enough disk space prior to installing CUDA.jl.

+ └─CUDA [052768ef] log: see above

A common source of these incompatibilities is having both CUDA.jl and the older CUDAnative.jl/CuArrays.jl/CUDAdrv.jl stack installed: These are incompatible, and cannot coexist. You can inspect in the Pkg REPL which exact packages you have installed using the status --manifest option.

Can you wrap this or that CUDA API?

If a certain API isn't wrapped with some high-level functionality, you can always use the underlying C APIs which are always available as unexported methods. For example, you can access the CUDA driver library as cu prefixed, unexported functions like CUDA.cuDriverGetVersion. Similarly, vendor libraries like CUBLAS are available through their exported submodule handles, e.g., CUBLAS.cublasGetVersion_v2.

Any help on designing or implementing high-level wrappers for this low-level functionality is greatly appreciated, so please consider contributing your uses of these APIs on the respective repositories.

When installing CUDA.jl on a cluster, why does Julia stall during precompilation?

If you're working on a cluster, precompilation may stall if you have not requested sufficient memory. You may also wish to make sure you have enough disk space prior to installing CUDA.jl.

diff --git a/dev/index.html b/dev/index.html index bb1a529425..c5aaf89fff 100644 --- a/dev/index.html +++ b/dev/index.html @@ -13,4 +13,4 @@ Pkg.test("CUDA") # the test suite takes command-line options that allow customization; pass --help for details: -#Pkg.test("CUDA"; test_args=`--help`)

For more details on the installation process, consult the Installation section. To understand the toolchain in more detail, have a look at the tutorials in this manual. It is highly recommended that new users start with the Introduction tutorial. For an overview of the available functionality, read the Usage section. The following resources may also be of interest:

Acknowledgements

The Julia CUDA stack has been a collaborative effort by many individuals. Significant contributions have been made by the following individuals:

Supporting and Citing

Much of the software in this ecosystem was developed as part of academic research. If you would like to help support it, please star the repository as such metrics may help us secure funding in the future. If you use our software as part of your research, teaching, or other activities, we would be grateful if you could cite our work. The CITATION.bib file in the root of this repository lists the relevant papers.

+#Pkg.test("CUDA"; test_args=`--help`)

For more details on the installation process, consult the Installation section. To understand the toolchain in more detail, have a look at the tutorials in this manual. It is highly recommended that new users start with the Introduction tutorial. For an overview of the available functionality, read the Usage section. The following resources may also be of interest:

Acknowledgements

The Julia CUDA stack has been a collaborative effort by many individuals. Significant contributions have been made by the following individuals:

Supporting and Citing

Much of the software in this ecosystem was developed as part of academic research. If you would like to help support it, please star the repository as such metrics may help us secure funding in the future. If you use our software as part of your research, teaching, or other activities, we would be grateful if you could cite our work. The CITATION.bib file in the root of this repository lists the relevant papers.

diff --git a/dev/installation/conditional/index.html b/dev/installation/conditional/index.html index bab61e2b10..553295f70a 100644 --- a/dev/installation/conditional/index.html +++ b/dev/installation/conditional/index.html @@ -33,4 +33,4 @@ function __init__() use_gpu[] = CUDA.functional() -end

The disadvantage of this approach is the introduction of a type instability.

+end

The disadvantage of this approach is the introduction of a type instability.

diff --git a/dev/installation/overview/index.html b/dev/installation/overview/index.html index b83d61bc18..b4f6504cd7 100644 --- a/dev/installation/overview/index.html +++ b/dev/installation/overview/index.html @@ -27,4 +27,4 @@ julia> CUDA.versioninfo() CUDA runtime 11.8, local installation ...

Calling the above helper function generates the following LocalPreferences.toml file in your active environment:

[CUDA_Runtime_jll]
-local = "true"

This preference not only configures CUDA.jl to use a local toolkit, it also prevents downloading any artifact, so it may be interesting to set this preference before ever importing CUDA.jl (e.g., by putting this preference file in a system-wide depot).

If CUDA.jl doesn't properly detect your local toolkit, it may be that certain libraries or binaries aren't on a globally-discoverable path. For more information, run Julia with the JULIA_DEBUG environment variable set to CUDA_Runtime_Discovery.

Note that using a local toolkit instead of artifacts affects any CUDA-related JLL, not just CUDA_Runtime_jll. Any package that depends on such a JLL needs to inspect CUDA.local_toolkit, and if set use CUDA_Runtime_Discovery to detect libraries and binaries instead.

Precompiling CUDA.jl without CUDA

CUDA.jl can be precompiled and imported on systems without a GPU or CUDA installation. This simplifies the situation where an application optionally uses CUDA. However, when CUDA.jl is precompiled in such an environment, it cannot be used to run GPU code. This is a result of artifacts being selected at precompile time.

In some cases, e.g. with containers or HPC log-in nodes, you may want to precompile CUDA.jl on a system without CUDA, yet still want to have it download the necessary artifacts and/or produce a precompilation image that can be used on a system with CUDA. This can be achieved by informing CUDA.jl which CUDA toolkit it will use at run time by calling CUDA.set_runtime_version!.

When using artifacts, that's as simple as e.g. calling CUDA.set_runtime_version!(v"11.8"), and afterwards re-starting Julia and re-importing CUDA.jl in order to trigger precompilation again and download the necessary artifacts. If you want to use a local CUDA installation, you also need to set the local_toolkit keyword argument, e.g., by calling CUDA.set_runtime_version!(v"11.8"; local_toolkit=true). Note that the version specified here needs to match what will be available at run time. In both cases, i.e. when using artifacts or a local toolkit, the chosen version needs to be compatible with the available driver.

Finally, in such a scenario you may also want to call CUDA.precompile_runtime() to ensure that the GPUCompiler runtime library is precompiled as well. This and all of the above is demonstrated in the Dockerfile that's part of the CUDA.jl repository.

+local = "true"

This preference not only configures CUDA.jl to use a local toolkit, it also prevents downloading any artifact, so it may be interesting to set this preference before ever importing CUDA.jl (e.g., by putting this preference file in a system-wide depot).

If CUDA.jl doesn't properly detect your local toolkit, it may be that certain libraries or binaries aren't on a globally-discoverable path. For more information, run Julia with the JULIA_DEBUG environment variable set to CUDA_Runtime_Discovery.

Note that using a local toolkit instead of artifacts affects any CUDA-related JLL, not just CUDA_Runtime_jll. Any package that depends on such a JLL needs to inspect CUDA.local_toolkit, and if set use CUDA_Runtime_Discovery to detect libraries and binaries instead.

Precompiling CUDA.jl without CUDA

CUDA.jl can be precompiled and imported on systems without a GPU or CUDA installation. This simplifies the situation where an application optionally uses CUDA. However, when CUDA.jl is precompiled in such an environment, it cannot be used to run GPU code. This is a result of artifacts being selected at precompile time.

In some cases, e.g. with containers or HPC log-in nodes, you may want to precompile CUDA.jl on a system without CUDA, yet still want to have it download the necessary artifacts and/or produce a precompilation image that can be used on a system with CUDA. This can be achieved by informing CUDA.jl which CUDA toolkit it will use at run time by calling CUDA.set_runtime_version!.

When using artifacts, that's as simple as e.g. calling CUDA.set_runtime_version!(v"11.8"), and afterwards re-starting Julia and re-importing CUDA.jl in order to trigger precompilation again and download the necessary artifacts. If you want to use a local CUDA installation, you also need to set the local_toolkit keyword argument, e.g., by calling CUDA.set_runtime_version!(v"11.8"; local_toolkit=true). Note that the version specified here needs to match what will be available at run time. In both cases, i.e. when using artifacts or a local toolkit, the chosen version needs to be compatible with the available driver.

Finally, in such a scenario you may also want to call CUDA.precompile_runtime() to ensure that the GPUCompiler runtime library is precompiled as well. This and all of the above is demonstrated in the Dockerfile that's part of the CUDA.jl repository.

diff --git a/dev/installation/troubleshooting/index.html b/dev/installation/troubleshooting/index.html index 4a7fff20cc..6107664c91 100644 --- a/dev/installation/troubleshooting/index.html +++ b/dev/installation/troubleshooting/index.html @@ -3,4 +3,4 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Troubleshooting

UndefVarError: libcuda not defined

This means that CUDA.jl could not find a suitable CUDA driver. For more information, re-run with the JULIA_DEBUG environment variable set to CUDA_Driver_jll.

UNKNOWN_ERROR(999)

If you encounter this error, there are several known issues that may be causing it:

  • a mismatch between the CUDA driver and driver library: on Linux, look for clues in dmesg
  • the CUDA driver is in a bad state: this can happen after resume. Try rebooting.

Generally though, it's impossible to say what the reason for the error is, but Julia is likely not to blame. Make sure your set-up works (e.g., try executing nvidia-smi, a CUDA C binary, etc.), and if everything looks good file an issue.

NVML library not found (on Windows)

Check and make sure the NVSMI folder is in your PATH. By default it may not be. Look in C:\Program Files\NVIDIA Corporation for the NVSMI folder - you should see nvml.dll within it. You can add this folder to your PATH and check that nvidia-smi runs properly.

The specified module could not be found (on Windows)

Ensure the Visual C++ Redistributable is installed.

+

Troubleshooting

UndefVarError: libcuda not defined

This means that CUDA.jl could not find a suitable CUDA driver. For more information, re-run with the JULIA_DEBUG environment variable set to CUDA_Driver_jll.

UNKNOWN_ERROR(999)

If you encounter this error, there are several known issues that may be causing it:

  • a mismatch between the CUDA driver and driver library: on Linux, look for clues in dmesg
  • the CUDA driver is in a bad state: this can happen after resume. Try rebooting.

Generally though, it's impossible to say what the reason for the error is, but Julia is likely not to blame. Make sure your set-up works (e.g., try executing nvidia-smi, a CUDA C binary, etc.), and if everything looks good file an issue.

NVML library not found (on Windows)

Check and make sure the NVSMI folder is in your PATH. By default it may not be. Look in C:\Program Files\NVIDIA Corporation for the NVSMI folder - you should see nvml.dll within it. You can add this folder to your PATH and check that nvidia-smi runs properly.

The specified module could not be found (on Windows)

Ensure the Visual C++ Redistributable is installed.

diff --git a/dev/lib/driver/index.html b/dev/lib/driver/index.html index d38493cf26..324d4ceea0 100644 --- a/dev/lib/driver/index.html +++ b/dev/lib/driver/index.html @@ -3,24 +3,24 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

CUDA driver

This section lists the package's public functionality that directly corresponds to functionality of the CUDA driver API. In general, the abstractions stay close to those of the CUDA driver API, so for more information on certain library calls you can consult the CUDA driver API reference.

The documentation is grouped according to the modules of the driver API.

Error Handling

CUDA.nameMethod
name(err::CuError)

Gets the string representation of an error code.

julia> err = CuError(CUDA.cudaError_enum(1))
+

CUDA driver

This section lists the package's public functionality that directly corresponds to functionality of the CUDA driver API. In general, the abstractions stay close to those of the CUDA driver API, so for more information on certain library calls you can consult the CUDA driver API reference.

The documentation is grouped according to the modules of the driver API.

Error Handling

CUDA.nameMethod
name(err::CuError)

Gets the string representation of an error code.

julia> err = CuError(CUDA.cudaError_enum(1))
 CuError(CUDA_ERROR_INVALID_VALUE)
 
 julia> name(err)
-"ERROR_INVALID_VALUE"
source

Version Management

CUDA.set_runtime_version!Function
CUDA.set_runtime_version!([version::VersionNumber]; [local_toolkit::Bool])

Configures the active project to use a specific CUDA toolkit version from a specific source.

If local_toolkit is set, the CUDA toolkit will be used from the local system, otherwise it will be downloaded from an artifact source. In the case of a local toolkit, version informs CUDA.jl which version that is (this may be useful if auto-detection fails). In the case of artifact sources, version controls which version will be downloaded and used.

When not specifying either the version or the local_toolkit argument, the default behavior will be used, which is to use the most recent compatible runtime available from an artifact source. Note that this will override any Preferences that may be configured in a higher-up depot; to clear preferences nondestructively, use CUDA.reset_runtime_version! instead.

source
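For example, a sketch of configuring a project for a local CUDA 11.8 installation (substitute a version matching your system and driver):

using CUDA
CUDA.set_runtime_version!(v"11.8"; local_toolkit=true)
# restart Julia and re-import CUDA.jl for the preference to take effect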
CUDA.reset_runtime_version!Function
CUDA.reset_runtime_version!()

Resets the CUDA version preferences in the active project to the default, which is to use the most recent compatible runtime available from an artifact source, unless a higher-up depot has configured a different preference. To force use of the default behavior for the local project, use CUDA.set_runtime_version! with no arguments.

source

Device Management

CUDA.current_deviceFunction
current_device()

Returns the current device.

Warning

This is a low-level API, returning the current device as known to the CUDA driver. For most users, it is recommended to use the device method instead.

source
CUDA.nameMethod
name(dev::CuDevice)

Returns an identifier string for the device.

source
CUDA.totalmemMethod
totalmem(dev::CuDevice)

Returns the total amount of memory (in bytes) on the device.

source
CUDA.attributeFunction
attribute(dev::CuDevice, code)

Returns information about the device.

source
attribute(X, pool::CuMemoryPool, attr)

Returns attribute attr about pool. The type of the returned value depends on the attribute, and as such must be passed as the X parameter.

source
attribute(X, ptr::Union{Ptr,CuPtr}, attr)

Returns attribute attr about pointer ptr. The type of the returned value depends on the attribute, and as such must be passed as the X parameter.

source

Certain common attributes are exposed by additional convenience functions:

CUDA.warpsizeMethod
warpsize(dev::CuDevice)

Returns the warp size (in threads) of the device.

source
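For example, a quick sketch of querying these properties for the first visible device:

using CUDA

dev = CuDevice(0)
println(CUDA.name(dev))
println(CUDA.totalmem(dev) ÷ 2^20, " MiB of memory")
println(CUDA.warpsize(dev), " threads per warp")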

Context Management

CUDA.CuContextType
CuContext(dev::CuDevice, flags=CTX_SCHED_AUTO)
-CuContext(f::Function, ...)

Create a CUDA context for device. A context on the GPU is analogous to a process on the CPU, with its own distinct address space and allocated resources. When a context is destroyed, the system cleans up the resources allocated to it.

When you are done using the context, call CUDA.unsafe_destroy! to mark it for deletion, or use do-block syntax with this constructor.

source
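For example, a sketch of the do-block form, which marks the context for destruction automatically:

using CUDA

dev = CuDevice(0)
CuContext(dev) do ctx
    # use the context here; it is cleaned up when the block returns
end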
CUDA.unsafe_destroy!Method
unsafe_destroy!(ctx::CuContext)

Immediately destroy a context, freeing up all resources associated with it. This does not respect any users of the context, and might make other objects unusable.

source
CUDA.current_contextFunction
current_context()

Returns the current context. Throws an undefined reference error if the current thread has no context bound to it, or if the bound context has been destroyed.

Warning

This is a low-level API, returning the current context as known to the CUDA driver. For most users, it is recommended to use the context method instead.

source
CUDA.activateMethod
activate(ctx::CuContext)

Binds the specified CUDA context to the calling CPU thread.

source
CUDA.synchronizeMethod
synchronize(ctx::Context)

Block for all operations on ctx to complete. This is a heavyweight operation; typically you only need to call synchronize, which only synchronizes the stream associated with the current task.

source
CUDA.device_synchronizeFunction
device_synchronize()

Block for all operations on ctx to complete. This is a heavyweight operation; typically you only need to call synchronize, which only synchronizes the stream associated with the current task.

On the device, device_synchronize acts as a synchronization point for child grids in the context of dynamic parallelism.

source

Primary Context Management

CUDA.CuPrimaryContextType
CuPrimaryContext(dev::CuDevice)

Create a primary CUDA context for a given device.

Each primary context is unique per device and is shared with the CUDA runtime API. It is meant for interoperability with (applications using) the runtime API.

source
CUDA.CuContextMethod
CuContext(pctx::CuPrimaryContext)

Derive a context from a primary context.

Calling this function increases the reference count of the primary context. The returned context should not be freed with the unsafe_destroy! function that's used with ordinary contexts. Instead, the refcount of the primary context should be decreased by calling unsafe_release!, or set to zero by calling unsafe_reset!. The easiest way to do this is by using the do-block syntax.

source
CUDA.isactiveMethod
isactive(pctx::CuPrimaryContext)

Query whether a primary context is active.

source
CUDA.flagsMethod
flags(pctx::CuPrimaryContext)

Query the flags of a primary context.

source
CUDA.unsafe_reset!Method
unsafe_reset!(pctx::CuPrimaryContext)

Explicitly destroys and cleans up all resources associated with a device's primary context in the current process. Note that this forcibly invalidates all contexts derived from this primary context, and as a result outstanding resources might become invalid.

source
CUDA.unsafe_release!Method
CUDA.unsafe_release!(pctx::CuPrimaryContext)

Lower the refcount of a context, possibly freeing up all resources associated with it. This does not respect any users of the context, and might make other objects unusable.

source
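A minimal sketch of the primary-context workflow described above, assuming a single device:

dev = CuDevice(0)
pctx = CuPrimaryContext(dev)
ctx = CuContext(pctx)            # derive a context, increasing the refcount
CUDA.isactive(pctx)              # true while derived contexts exist
CUDA.unsafe_release!(pctx)       # decrease the refcount again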

Module Management

CUDA.CuModuleType
CuModule(data, options::Dict{CUjit_option,Any})
-CuModuleFile(path, options::Dict{CUjit_option,Any})

Create a CUDA module from data, or from a file containing data. The data may be PTX code, a CUBIN, or a FATBIN.

options is an optional dictionary of JIT options and their respective values.

source

Function Management

CUDA.CuFunctionType
CuFunction(mod::CuModule, name::String)

Acquires a function handle from a named function in a module.

source

Global Variable Management

CUDA.CuGlobalType
CuGlobal{T}(mod::CuModule, name::String)

Acquires a typed global variable handle from a named global in a module.

source
Base.eltypeMethod
eltype(var::CuGlobal)

Return the element type of a global variable object.

source
Base.getindexMethod
Base.getindex(var::CuGlobal)

Return the current value of a global variable.

source
Base.setindex!Method
Base.setindex!(var::CuGlobal{T}, val::T)

Set the value of a global variable to val.

source
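Putting the above together, a hedged sketch of loading a module and accessing its functions and globals; the PTX file and symbol names are hypothetical:

md = CuModuleFile("vadd.ptx")            # hypothetical PTX file
vadd = CuFunction(md, "vadd")            # hypothetical kernel name
counter = CuGlobal{Int32}(md, "counter") # hypothetical global variable
counter[] = Int32(0)                     # set its value
counter[]                                # read it back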

Linker

CUDA.add_data!Function
add_data!(link::CuLink, name::String, code::String)

Add PTX code to a pending link operation.

source
add_data!(link::CuLink, name::String, data::Vector{UInt8})

Add object code to a pending link operation.

source
CUDA.add_file!Function
add_file!(link::CuLink, path::String, typ::CUjitInputType)

Add data from a file to a link operation. The argument typ indicates the type of the contained data.

source
CUDA.CuLinkImageType

The result of a linking operation.

This object keeps its parent linker object alive, as destroying a linker destroys linked images too.

source
CUDA.completeFunction
complete(link::CuLink)

Complete a pending linker invocation, returning an output image.

source
CUDA.CuModuleMethod
CuModule(img::CuLinkImage, ...)

Create a CUDA module from a completed linking operation. Options from CuModule apply.

source
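A minimal linking sketch, assuming ptx_code holds PTX source as a String:

link = CuLink()
CUDA.add_data!(link, "kernel", ptx_code)  # add PTX code to the pending link
img = CUDA.complete(link)                 # finish linking
md = CuModule(img)                        # turn the image into a loadable module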

Memory Management

Different kinds of memory objects can be created, representing the different kinds of memory that the CUDA toolkit supports. Each of these memory objects can be allocated by calling alloc with the type of memory as the first argument, and freed by calling free. Certain kinds of memory have specific methods defined.

Device memory

This memory is accessible only by the GPU, and is the most common kind of memory used in CUDA programming.

CUDA.allocMethod
alloc(DeviceMemory, bytesize::Integer;
-      [async=false], [stream::CuStream], [pool::CuMemoryPool])

Allocate bytesize bytes of memory on the device. This memory is only accessible on the GPU, and requires explicit calls to unsafe_copyto!, which wraps cuMemcpy, for access on the CPU.

source
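A sketch of the raw allocation API; the conversion to a CuPtr is assumed to be supported for this memory type:

mem = CUDA.alloc(CUDA.DeviceMemory, 1024)   # 1 KiB of device memory
ptr = convert(CuPtr{UInt8}, mem)            # raw device pointer for use with unsafe_copyto! etc.
CUDA.free(mem)                              # release it again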

Unified memory

Unified memory is accessible by both the CPU and the GPU, and is managed by the CUDA runtime. It is automatically migrated between the CPU and the GPU as needed, which simplifies programming but can lead to performance issues if not used carefully.

CUDA.allocMethod
alloc(UnifiedMemory, bytesize::Integer, [flags::CUmemAttach_flags])

Allocate bytesize bytes of unified memory. This memory is accessible from both the CPU and GPU, with the CUDA driver automatically copying upon first access.

source
CUDA.prefetchMethod
prefetch(::UnifiedMemory, [bytes::Integer]; [device::CuDevice], [stream::CuStream])

Prefetches memory to the specified destination device.

source
CUDA.adviseMethod
advise(::UnifiedMemory, advice::CUDA.CUmem_advise, [bytes::Integer]; [device::CuDevice])

Advise about the usage of a given memory range.

source
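A sketch of allocating unified memory and prefetching it to the current device:

mem = CUDA.alloc(CUDA.UnifiedMemory, 1024)
CUDA.prefetch(mem, 1024; device=device())   # hint: migrate the pages to the current device
CUDA.free(mem)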

Host memory

Host memory resides on the CPU, but is accessible by the GPU via the PCI bus. This is the slowest kind of memory, but is useful for communicating between running kernels and the host (e.g., to update counters or flags).

CUDA.HostMemoryType
HostMemory

Pinned memory residing on the CPU, possibly accessible on the GPU.

source
CUDA.allocMethod
alloc(HostMemory, bytesize::Integer, [flags])

Allocate bytesize bytes of page-locked memory on the host. This memory is accessible from the CPU, and makes it possible to perform faster memory copies to the GPU. Furthermore, if flags is set to MEMHOSTALLOC_DEVICEMAP the memory is also accessible from the GPU. These accesses are direct, and go through the PCI bus. If flags is set to MEMHOSTALLOC_PORTABLE, the memory is considered mapped by all CUDA contexts, not just the one that created the memory, which is useful if the memory needs to be accessed from multiple devices. Multiple flags can be set at once using a bitwise OR:

flags = MEMHOSTALLOC_PORTABLE | MEMHOSTALLOC_DEVICEMAP
source
CUDA.registerMethod
register(HostMemory, ptr::Ptr, bytesize::Integer, [flags])

Page-lock the host memory pointed to by ptr. Subsequent transfers to and from devices will be faster, and can be executed asynchronously. If the MEMHOSTREGISTER_DEVICEMAP flag is specified, the buffer will also be accessible directly from the GPU. These accesses are direct, and go through the PCI bus. If the MEMHOSTREGISTER_PORTABLE flag is specified, any CUDA context can access the memory.

source
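A sketch of pinned host memory with combined flags, as described above; the conversion to a host Ptr is assumed to be supported:

flags = CUDA.MEMHOSTALLOC_PORTABLE | CUDA.MEMHOSTALLOC_DEVICEMAP
mem = CUDA.alloc(CUDA.HostMemory, 1024, flags)
ptr = convert(Ptr{UInt8}, mem)              # pinned host pointer; copies to the GPU are faster
CUDA.free(mem)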

Array memory

Array memory is a special kind of memory that is optimized for 2D and 3D access patterns. The memory is opaquely managed by the CUDA runtime, and is typically only used in combination with texture intrinsics.

CUDA.ArrayMemoryType
ArrayMemory

Array memory residing on the GPU, possibly in a specially-formatted way.

source
CUDA.allocMethod
alloc(ArrayMemory, dims::Dims)

Allocate array memory with dimensions dims. The memory is accessible on the GPU, but can only be used in conjunction with special intrinsics (e.g., texture intrinsics).

source

Pointers

To work with these buffers, you need to convert them to a Ptr, CuPtr, or, in the case of ArrayMemory, a CuArrayPtr. You can then use common Julia methods on these pointers, such as unsafe_copyto!. CUDA.jl also provides some specialized functionality that does not match standard Julia functionality:

Version Management

CUDA.set_runtime_version!Function
CUDA.set_runtime_version!([version::VersionNumber]; [local_toolkit::Bool])

Configures the active project to use a specific CUDA toolkit version from a specific source.

If local_toolkit is set, the CUDA toolkit will be used from the local system, otherwise it will be downloaded from an artifact source. In the case of a local toolkit, version informs CUDA.jl which version that is (this may be useful if auto-detection fails). In the case of artifact sources, version controls which version will be downloaded and used.

When not specifying either the version or the local_toolkit argument, the default behavior will be used, which is to use the most recent compatible runtime available from an artifact source. Note that this will override any Preferences that may be configured in a higher-up depot; to clear preferences nondestructively, use CUDA.reset_runtime_version! instead.

source
CUDA.reset_runtime_version!Function
CUDA.reset_runtime_version!()

Resets the CUDA version preferences in the active project to the default, which is to use the most recent compatible runtime available from an artifact source, unless a higher-up depot has configured a different preference. To force use of the default behavior for the local project, use CUDA.set_runtime_version! with no arguments.

source

Device Management

CUDA.current_deviceFunction
current_device()

Returns the current device.

Warning

This is a low-level API, returning the current device as known to the CUDA driver. For most users, it is recommended to use the device method instead.

source
CUDA.nameMethod
name(dev::CuDevice)

Returns an identifier string for the device.

source
CUDA.totalmemMethod
totalmem(dev::CuDevice)

Returns the total amount of memory (in bytes) on the device.

source
CUDA.attributeFunction
attribute(dev::CuDevice, code)

Returns information about the device.

source
attribute(X, pool::CuMemoryPool, attr)

Returns attribute attr about pool. The type of the returned value depends on the attribute, and as such must be passed as the X parameter.

source
attribute(X, ptr::Union{Ptr,CuPtr}, attr)

Returns attribute attr about pointer ptr. The type of the returned value depends on the attribute, and as such must be passed as the X parameter.

source

Certain common attributes are exposed by additional convenience functions:

CUDA.warpsizeMethod
warpsize(dev::CuDevice)

Returns the warp size (in threads) of the device.

source

Context Management

CUDA.CuContextType
CuContext(dev::CuDevice, flags=CTX_SCHED_AUTO)
+CuContext(f::Function, ...)

Create a CUDA context for device. A context on the GPU is analogous to a process on the CPU, with its own distinct address space and allocated resources. When a context is destroyed, the system cleans up the resources allocated to it.

When you are done using the context, call CUDA.unsafe_destroy! to mark it for deletion, or use do-block syntax with this constructor.

source
CUDA.unsafe_destroy!Method
unsafe_destroy!(ctx::CuContext)

Immediately destroy a context, freeing up all resources associated with it. This does not respect any users of the context, and might make other objects unusable.

source
CUDA.current_contextFunction
current_context()

Returns the current context. Throws an undefined reference error if the current thread has no context bound to it, or if the bound context has been destroyed.

Warning

This is a low-level API, returning the current context as known to the CUDA driver. For most users, it is recommended to use the context method instead.

source
CUDA.activateMethod
activate(ctx::CuContext)

Binds the specified CUDA context to the calling CPU thread.

source
CUDA.synchronizeMethod
synchronize(ctx::Context)

Block for all operations on ctx to complete. This is a heavyweight operation; typically you only need to call synchronize, which only synchronizes the stream associated with the current task.

source
CUDA.device_synchronizeFunction
device_synchronize()

Block for all operations on ctx to complete. This is a heavyweight operation; typically you only need to call synchronize, which only synchronizes the stream associated with the current task.

On the device, device_synchronize acts as a synchronization point for child grids in the context of dynamic parallelism.

source

Primary Context Management

CUDA.CuPrimaryContextType
CuPrimaryContext(dev::CuDevice)

Create a primary CUDA context for a given device.

Each primary context is unique per device and is shared with the CUDA runtime API. It is meant for interoperability with (applications using) the runtime API.

source
CUDA.CuContextMethod
CuContext(pctx::CuPrimaryContext)

Derive a context from a primary context.

Calling this function increases the reference count of the primary context. The returned context should not be freed with the unsafe_destroy! function that's used with ordinary contexts. Instead, the refcount of the primary context should be decreased by calling unsafe_release!, or set to zero by calling unsafe_reset!. The easiest way to do this is by using the do-block syntax.

source
CUDA.isactiveMethod
isactive(pctx::CuPrimaryContext)

Query whether a primary context is active.

source
CUDA.flagsMethod
flags(pctx::CuPrimaryContext)

Query the flags of a primary context.

source
CUDA.unsafe_reset!Method
unsafe_reset!(pctx::CuPrimaryContext)

Explicitly destroys and cleans up all resources associated with a device's primary context in the current process. Note that this forcibly invalidates all contexts derived from this primary context, and as a result outstanding resources might become invalid.

source
CUDA.unsafe_release!Method
CUDA.unsafe_release!(pctx::CuPrimaryContext)

Lower the refcount of a context, possibly freeing up all resources associated with it. This does not respect any users of the context, and might make other objects unusable.

source

Module Management

CUDA.CuModuleType
CuModule(data, options::Dict{CUjit_option,Any})
+CuModuleFile(path, options::Dict{CUjit_option,Any})

Create a CUDA module from data, or from a file containing data. The data may be PTX code, a CUBIN, or a FATBIN.

options is an optional dictionary of JIT options and their respective values.

source

Function Management

CUDA.CuFunctionType
CuFunction(mod::CuModule, name::String)

Acquires a function handle from a named function in a module.

source

Global Variable Management

CUDA.CuGlobalType
CuGlobal{T}(mod::CuModule, name::String)

Acquires a typed global variable handle from a named global in a module.

source
Base.eltypeMethod
eltype(var::CuGlobal)

Return the element type of a global variable object.

source
Base.getindexMethod
Base.getindex(var::CuGlobal)

Return the current value of a global variable.

source
Base.setindex!Method
Base.setindex!(var::CuGlobal{T}, val::T)

Set the value of a global variable to val.

source

Linker

CUDA.add_data!Function
add_data!(link::CuLink, name::String, code::String)

Add PTX code to a pending link operation.

source
add_data!(link::CuLink, name::String, data::Vector{UInt8})

Add object code to a pending link operation.

source
CUDA.add_file!Function
add_file!(link::CuLink, path::String, typ::CUjitInputType)

Add data from a file to a link operation. The argument typ indicates the type of the contained data.

source
CUDA.CuLinkImageType

The result of a linking operation.

This object keeps its parent linker object alive, as destroying a linker destroys linked images too.

source
CUDA.completeFunction
complete(link::CuLink)

Complete a pending linker invocation, returning an output image.

source
CUDA.CuModuleMethod
CuModule(img::CuLinkImage, ...)

Create a CUDA module from a completed linking operation. Options from CuModule apply.

source

Memory Management

Different kinds of memory objects can be created, representing the different kinds of memory that the CUDA toolkit supports. Each of these memory objects can be allocated by calling alloc with the type of memory as the first argument, and freed by calling free. Certain kinds of memory have specific methods defined.

Device memory

This memory is accessible only by the GPU, and is the most common kind of memory used in CUDA programming.

CUDA.allocMethod
alloc(DeviceMemory, bytesize::Integer;
+      [async=false], [stream::CuStream], [pool::CuMemoryPool])

Allocate bytesize bytes of memory on the device. This memory is only accessible on the GPU, and requires explicit calls to unsafe_copyto!, which wraps cuMemcpy, for access on the CPU.

source

Unified memory

Unified memory is accessible by both the CPU and the GPU, and is managed by the CUDA runtime. It is automatically migrated between the CPU and the GPU as needed, which simplifies programming but can lead to performance issues if not used carefully.

CUDA.allocMethod
alloc(UnifiedMemory, bytesize::Integer, [flags::CUmemAttach_flags])

Allocate bytesize bytes of unified memory. This memory is accessible from both the CPU and GPU, with the CUDA driver automatically copying upon first access.

source
CUDA.prefetchMethod
prefetch(::UnifiedMemory, [bytes::Integer]; [device::CuDevice], [stream::CuStream])

Prefetches memory to the specified destination device.

source
CUDA.adviseMethod
advise(::UnifiedMemory, advice::CUDA.CUmem_advise, [bytes::Integer]; [device::CuDevice])

Advise about the usage of a given memory range.

source

Host memory

Host memory resides on the CPU, but is accessible by the GPU via the PCI bus. This is the slowest kind of memory, but is useful for communicating between running kernels and the host (e.g., to update counters or flags).

CUDA.HostMemoryType
HostMemory

Pinned memory residing on the CPU, possibly accessible on the GPU.

source
CUDA.allocMethod
alloc(HostMemory, bytesize::Integer, [flags])

Allocate bytesize bytes of page-locked memory on the host. This memory is accessible from the CPU, and makes it possible to perform faster memory copies to the GPU. Furthermore, if flags is set to MEMHOSTALLOC_DEVICEMAP the memory is also accessible from the GPU. These accesses are direct, and go through the PCI bus. If flags is set to MEMHOSTALLOC_PORTABLE, the memory is considered mapped by all CUDA contexts, not just the one that created the memory, which is useful if the memory needs to be accessed from multiple devices. Multiple flags can be set at once using a bitwise OR:

flags = MEMHOSTALLOC_PORTABLE | MEMHOSTALLOC_DEVICEMAP
source
CUDA.registerMethod
register(HostMemory, ptr::Ptr, bytesize::Integer, [flags])

Page-lock the host memory pointed to by ptr. Subsequent transfers to and from devices will be faster, and can be executed asynchronously. If the MEMHOSTREGISTER_DEVICEMAP flag is specified, the buffer will also be accessible directly from the GPU. These accesses are direct, and go through the PCI bus. If the MEMHOSTREGISTER_PORTABLE flag is specified, any CUDA context can access the memory.

source

Array memory

Array memory is a special kind of memory that is optimized for 2D and 3D access patterns. The memory is opaquely managed by the CUDA runtime, and is typically only used in combination with texture intrinsics.

CUDA.ArrayMemoryType
ArrayMemory

Array memory residing on the GPU, possibly in a specially-formatted way.

source
CUDA.allocMethod
alloc(ArrayMemory, dims::Dims)

Allocate array memory with dimensions dims. The memory is accessible on the GPU, but can only be used in conjunction with special intrinsics (e.g., texture intrinsics).

source

Pointers

To work with these buffers, you need to convert them to a Ptr, CuPtr, or, in the case of ArrayMemory, a CuArrayPtr. You can then use common Julia methods on these pointers, such as unsafe_copyto!. CUDA.jl also provides some specialized functionality that does not match standard Julia functionality:

CUDA.unsafe_copy2d!Function
unsafe_copy2d!(dst, dstTyp, src, srcTyp, width, height=1;
                dstPos=(1,1), dstPitch=0,
                srcPos=(1,1), srcPitch=0,
-               async=false, stream=nothing)

Perform a 2D memory copy between pointers src and dst, at respectively position srcPos and dstPos (1-indexed). Pitch can be specified for both the source and destination; consult the CUDA documentation for more details. This call is executed asynchronously if async is set, otherwise stream is synchronized.

source
CUDA.unsafe_copy3d!Function
unsafe_copy3d!(dst, dstTyp, src, srcTyp, width, height=1, depth=1;
+               async=false, stream=nothing)

Perform a 2D memory copy between pointers src and dst, at respectively position srcPos and dstPos (1-indexed). Pitch can be specified for both the source and destination; consult the CUDA documentation for more details. This call is executed asynchronously if async is set, otherwise stream is synchronized.

source
CUDA.unsafe_copy3d!Function
unsafe_copy3d!(dst, dstTyp, src, srcTyp, width, height=1, depth=1;
                dstPos=(1,1,1), dstPitch=0, dstHeight=0,
                srcPos=(1,1,1), srcPitch=0, srcHeight=0,
-               async=false, stream=nothing)

Perform a 3D memory copy between pointers src and dst, at respectively position srcPos and dstPos (1-indexed). Both pitch and height can be specified for both the source and destination; consult the CUDA documentation for more details. This call is executed asynchronously if async is set, otherwise stream is synchronized.

source
CUDA.memsetFunction
memset(mem::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; [stream::CuStream])

Initialize device memory by repeating value len times.

source
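For example, a hedged sketch that clears a freshly allocated device buffer:

mem = CUDA.alloc(CUDA.DeviceMemory, 256)
ptr = convert(CuPtr{UInt8}, mem)
CUDA.memset(ptr, 0x00, 256)                 # write 256 zero bytes
CUDA.free(mem)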

Other

CUDA.free_memoryFunction
free_memory()

Returns the free amount of memory (in bytes), available for allocation by the CUDA context.

source
CUDA.total_memoryFunction
total_memory()

Returns the total amount of memory (in bytes), available for allocation by the CUDA context.

source

Stream Management

CUDA.CuStreamType
CuStream(; flags=STREAM_DEFAULT, priority=nothing)

Create a CUDA stream.

source
CUDA.isdoneMethod
isdone(s::CuStream)

Return false if a stream is busy (has tasks running or queued) and true if that stream is free.

source
CUDA.priority_rangeFunction
priority_range()

Return the valid range of stream priorities as a StepRange (with step size 1). The lower bound of the range denotes the least priority (typically 0), with the upper bound representing the greatest possible priority (typically -1).

source
CUDA.synchronizeMethod
synchronize([stream::CuStream])

Wait until stream has finished executing, with stream defaulting to the stream associated with the current Julia task.

See also: device_synchronize

source
CUDA.@syncMacro
@sync [blocking=false] ex

Run expression ex and synchronize the GPU afterwards.

The blocking keyword argument determines how synchronization is performed. By default, non-blocking synchronization will be used, which gives other Julia tasks a chance to run while waiting for the GPU to finish. This may increase latency, so for short operations, or when benchmarking code that does not use multiple tasks, it may be beneficial to use blocking synchronization instead by setting blocking=true. Blocking synchronization can also be enabled globally by changing the nonblocking_synchronization preference.

See also: synchronize.

source
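A small sketch combining these pieces:

s = CuStream()                 # an independent stream
x = CUDA.rand(1024)
CUDA.@sync x .+= 1             # run on the task-local stream and wait for it
synchronize(s)                 # wait for work queued on `s` (nothing here)
CUDA.isdone(s)                 # true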

For specific use cases, special streams are available:

CUDA.default_streamFunction
default_stream()

Return the default stream.

Note

It is generally better to use stream() to get a stream object that's local to the current task. That way, operations scheduled in other tasks can overlap.

source
CUDA.legacy_streamFunction
legacy_stream()

Return a special object to use an implicit stream with legacy synchronization behavior.

You can use this stream to perform operations that should block on all streams (with the exception of streams created with STREAM_NON_BLOCKING). This matches the old pre-CUDA 7 global stream behavior.

source
CUDA.per_thread_streamFunction
per_thread_stream()

Return a special object to use an implicit stream with per-thread synchronization behavior. This stream object is normally meant to be used with APIs that do not have per-thread versions (i.e. without a ptsz or ptds suffix).

Note

It is generally not necessary to use this type of stream: with CUDA.jl, each task already gets its own non-blocking stream, and multithreading in Julia is typically accomplished using tasks.

source

Event Management

CUDA.recordFunction
record(e::CuEvent, [stream::CuStream])

Record an event on a stream.

source
CUDA.isdoneMethod
isdone(e::CuEvent)

Return false if there is outstanding work preceding the most recent call to record(e) and true if all captured work has been completed.

source
CUDA.elapsedFunction
elapsed(start::CuEvent, stop::CuEvent)

Computes the elapsed time between two events (in seconds).

source
CUDA.@elapsedMacro
@elapsed [blocking=false] ex

A macro to evaluate an expression, discarding the resulting value, instead returning the number of seconds it took to execute on the GPU, as a floating-point number.

See also: @sync.

source
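For example, timing a broadcast on the GPU with the macro above:

x = CUDA.rand(1024, 1024)
t = CUDA.@elapsed x .+= 1      # GPU execution time in seconds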

Execution Control

CUDA.CuDim3Type
CuDim3(x)
+               async=false, stream=nothing)

Perform a 3D memory copy between pointers src and dst, at respectively position srcPos and dstPos (1-indexed). Both pitch and height can be specified for both the source and destination; consult the CUDA documentation for more details. This call is executed asynchronously if async is set, otherwise stream is synchronized.

source
CUDA.memsetFunction
memset(mem::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; [stream::CuStream])

Initialize device memory by repeating value len times.

source

Other

CUDA.free_memoryFunction
free_memory()

Returns the free amount of memory (in bytes), available for allocation by the CUDA context.

source
CUDA.total_memoryFunction
total_memory()

Returns the total amount of memory (in bytes), available for allocation by the CUDA context.

source

Stream Management

CUDA.CuStreamType
CuStream(; flags=STREAM_DEFAULT, priority=nothing)

Create a CUDA stream.

source
CUDA.isdoneMethod
isdone(s::CuStream)

Return false if a stream is busy (has tasks running or queued) and true if that stream is free.

source
CUDA.priority_rangeFunction
priority_range()

Return the valid range of stream priorities as a StepRange (with step size 1). The lower bound of the range denotes the least priority (typically 0), with the upper bound representing the greatest possible priority (typically -1).

source
CUDA.synchronizeMethod
synchronize([stream::CuStream])

Wait until stream has finished executing, with stream defaulting to the stream associated with the current Julia task.

See also: device_synchronize

source
CUDA.@syncMacro
@sync [blocking=false] ex

Run expression ex and synchronize the GPU afterwards.

The blocking keyword argument determines how synchronization is performed. By default, non-blocking synchronization will be used, which gives other Julia tasks a chance to run while waiting for the GPU to finish. This may increase latency, so for short operations, or when benchmarking code that does not use multiple tasks, it may be beneficial to use blocking synchronization instead by setting blocking=true. Blocking synchronization can also be enabled globally by changing the nonblocking_synchronization preference.

See also: synchronize.

source

For specific use cases, special streams are available:

CUDA.default_streamFunction
default_stream()

Return the default stream.

Note

It is generally better to use stream() to get a stream object that's local to the current task. That way, operations scheduled in other tasks can overlap.

source
CUDA.legacy_streamFunction
legacy_stream()

Return a special object to use an implicit stream with legacy synchronization behavior.

You can use this stream to perform operations that should block on all streams (with the exception of streams created with STREAM_NON_BLOCKING). This matches the old pre-CUDA 7 global stream behavior.

source
CUDA.per_thread_streamFunction
per_thread_stream()

Return a special object to use an implicit stream with per-thread synchronization behavior. This stream object is normally meant to be used with APIs that do not have per-thread versions (i.e. without a ptsz or ptds suffix).

Note

It is generally not necessary to use this type of stream: with CUDA.jl, each task already gets its own non-blocking stream, and multithreading in Julia is typically accomplished using tasks.

source

Event Management

CUDA.recordFunction
record(e::CuEvent, [stream::CuStream])

Record an event on a stream.

source
CUDA.isdoneMethod
isdone(e::CuEvent)

Return false if there is outstanding work preceding the most recent call to record(e) and true if all captured work has been completed.

source
CUDA.elapsedFunction
elapsed(start::CuEvent, stop::CuEvent)

Computes the elapsed time between two events (in seconds).

source
CUDA.@elapsedMacro
@elapsed [blocking=false] ex

A macro to evaluate an expression, discarding the resulting value, instead returning the number of seconds it took to execute on the GPU, as a floating-point number.

See also: @sync.

source

Execution Control

CUDA.CuDim3Type
CuDim3(x)
 
 CuDim3((x,))
 CuDim3((x, y))
-CuDim3((x, y, z))

A type used to specify dimensions, consisting of 3 integers for respectively the x, y and z dimension. Unspecified dimensions default to 1.

Often accepted as an argument through the CuDim type alias, e.g. in the case of cudacall or CUDA.launch, allowing dimensions to be passed as a plain integer or a tuple without having to construct an explicit CuDim3 object.

source
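For illustration (CuDim3 may need to be qualified as CUDA.CuDim3 if it is not exported):

CUDA.CuDim3(128)           # (128, 1, 1)
CUDA.CuDim3((16, 16))      # (16, 16, 1)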
CUDA.cudacallFunction
cudacall(f, types, values...; blocks::CuDim, threads::CuDim,
+CuDim3((x, y, z))

A type used to specify dimensions, consisting of 3 integers for respectively the x, y and z dimension. Unspecified dimensions default to 1.

Often accepted as an argument through the CuDim type alias, e.g. in the case of cudacall or CUDA.launch, allowing dimensions to be passed as a plain integer or a tuple without having to construct an explicit CuDim3 object.

source
CUDA.cudacallFunction
cudacall(f, types, values...; blocks::CuDim, threads::CuDim,
          cooperative=false, shmem=0, stream=stream())

ccall-like interface for launching a CUDA function f on a GPU.

For example:

vadd = CuFunction(md, "vadd")
 a = rand(Float32, 10)
 b = rand(Float32, 10)
@@ -32,13 +32,13 @@
 cd = alloc(CUDA.DeviceMemory, 10*sizeof(Float32))
 
 cudacall(vadd, (CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}), ad, bd, cd; threads=10)
-unsafe_copyto!(convert(Ptr{Cvoid}, c), cd, 10*sizeof(Float32)))

The blocks and threads arguments control the launch configuration, and should both consist of either an integer, or a tuple of 1 to 3 integers (omitted dimensions default to 1). The types argument can be either a tuple of types or a tuple type, the latter being slightly faster.

source
CUDA.launchFunction
launch(f::CuFunction, args...; blocks::CuDim=1, threads::CuDim=1,
-       cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function f on the GPU, using blocks and threads as respectively the grid and block configuration. Dynamic shared memory is allocated according to shmem, and the kernel is launched on stream stream.

Arguments to a kernel should either be bitstype, in which case they will be copied to the internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use cudacall instead.

source
launch(exec::CuGraphExec, [stream::CuStream])

Launches an executable graph, by default in the currently-active stream.

source

Profiler Control

CUDA.@profileMacro
@profile [trace=false] [raw=false] code...
-@profile external=true code...

Profile the GPU execution of code.

There are two modes of operation, depending on whether external is true or false. The default value depends on whether Julia is being run under an external profiler.

Integrated profiler (external=false, the default)

In this mode, CUDA.jl will profile the execution of code and display the result. By default, a summary of host- and device-side execution will be shown, including any NVTX events. To display a chronological trace of the captured activity instead, trace can be set to true. Trace output will include an ID column that can be used to match host-side and device-side activity. If raw is true, all data will always be included, even if it may not be relevant. The output will be written to io, which defaults to stdout.

Slow operations will be highlighted in the output: Entries colored in yellow are among the slowest 25%, while entries colored in red are among the slowest 5% of all operations.

!!! compat "Julia 1.9" This functionality is only available on Julia 1.9 and later.

!!! compat "CUDA 11.2" Older versions of CUDA, before 11.2, contain bugs that may prevent the CUDA.@profile macro to work. It is recommended to use a newer runtime.

External profilers (external=true, when an external profiler is detected)

For more advanced profiling, it is possible to use an external profiling tool, such as NSight Systems or NSight Compute. When doing so, it is often advisable to only enable the profiler for the specific code region of interest. This can be done by wrapping the code with CUDA.@profile external=true, which used to be the only way to use this macro.

source
CUDA.Profile.startFunction
start()

Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then this call has no effect.

source
CUDA.Profile.stopFunction
stop()

Disables profile collection by the active profiling tool for the current context. If profiling is already disabled, then this call has no effect.

source
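A sketch of both modes described above; y_d and x_d are assumed to be existing CuArrays (as in the tutorials):

CUDA.@profile CUDA.@sync y_d .+= x_d   # integrated profiler: print a summary for this block

CUDA.Profile.start()                   # external profiler (e.g. under NSight Systems):
# ... code of interest ...             # only capture the region of interest
CUDA.Profile.stop()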

Texture Memory

Textures are represented by objects of type CuTexture which are bound to some underlying memory, either CuArrays or CuTextureArrays:

CUDA.CuTextureType
CuTexture{T,N,P}

N-dimensional texture object with elements of type T. These objects do not store data themselves, but are bound to another source of device memory. Texture objects can be passed to CUDA kernels, where they will be accessible through the CuDeviceTexture type.

Warning

Experimental API. Subject to change without deprecation.

source
CUDA.CuTextureMethod
CuTexture{T,N,P}(parent::P; address_mode, filter_mode, normalized_coordinates)

Construct an N-dimensional texture object with elements of type T as stored in parent.

Several keyword arguments alter the behavior of texture objects:

  • address_mode (wrap, clamp, mirror): how out-of-bounds values are accessed. Can be specified as a value for all dimensions, or as a tuple of N entries.
  • interpolation (nearest neighbour, linear, bilinear): how non-integral indices are fetched. Nearest-neighbour fetches a single value, others interpolate between multiple.
  • normalized_coordinates (true, false): whether indices are expected to fall in the normalized [0:1) range.

!!! warning Experimental API. Subject to change without deprecation.

source
CuTexture(x::CuTextureArray{T,N})

Create an N-dimensional texture object with elements of type T that will be read from x.

Warning

Experimental API. Subject to change without deprecation.

source
CuTexture(x::CuArray{T,N})

Create an N-dimensional texture object that reads from a CuArray.

Note that the memory must be well aligned and properly strided (good pitch). Currently, this is not enforced.

Warning

Experimental API. Subject to change without deprecation.

source

You can create CuTextureArray objects from both host and device memory:

CUDA.CuTextureArrayType
CuTextureArray{T,N}(undef, dims)

N-dimensional dense texture array with elements of type T. These arrays are optimized for texture fetching, and are only meant to be used as a source for CuTexture{T,N,P} objects.

Warning

Experimental API. Subject to change without deprecation.

source
CUDA.CuTextureArrayMethod
CuTextureArray(A::AbstractArray)

Allocate and initialize a texture array from host memory in A.

Warning

Experimental API. Subject to change without deprecation.

source
CuTextureArray(A::CuArray)

Allocate and initialize a texture array from device memory in A.

Warning

Experimental API. Subject to change without deprecation.

source
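A minimal sketch of creating a texture from host data:

A = rand(Float32, 256, 256)
texarr = CuTextureArray(A)     # copy the host data into an optimized texture array
tex = CuTexture(texarr)        # a texture object that kernels can sample via CuDeviceTexture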

Occupancy API

The occupancy API can be used to figure out an appropriate launch configuration for a compiled kernel (represented as a CuFunction) on the current device:

CUDA.launch_configurationFunction
launch_configuration(fun::CuFunction; shmem=0, max_threads=0)

Calculate a suggested launch configuration for kernel fun requiring shmem bytes of dynamic shared memory. Returns a tuple with a suggested number of threads, and the minimal number of blocks to reach maximal occupancy. Optionally, the maximum number of threads can be constrained using max_threads.

In the case of a variable amount of shared memory, pass a callable object for shmem instead, taking a single integer representing the block size and returning the amount of dynamic shared memory for that configuration.

source
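A hedged sketch of how this is typically used, assuming a kernel compiled with @cuda launch=false (gpu_add3!, y_d and x_d as in the tutorial):

kernel = @cuda launch=false gpu_add3!(y_d, x_d)   # compile without launching
config = launch_configuration(kernel.fun)
threads = min(length(y_d), config.threads)
blocks  = cld(length(y_d), threads)
kernel(y_d, x_d; threads, blocks)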
CUDA.active_blocksFunction
active_blocks(fun::CuFunction, threads; shmem=0)

Calculate the maximum number of active blocks per multiprocessor when running threads threads of a kernel fun requiring shmem bytes of dynamic shared memory.

source
CUDA.occupancyFunction
occupancy(fun::CuFunction, threads; shmem=0)

Calculate the theoretical occupancy of launching threads threads of a kernel fun requiring shmem bytes of dynamic shared memory.

source

Graph Execution

CUDA graphs can be easily recorded and executed using the high-level @captured macro:

CUDA.@capturedMacro
for ...
+unsafe_copyto!(convert(Ptr{Cvoid}, c), cd, 10*sizeof(Float32)))

The blocks and threads arguments control the launch configuration, and should both consist of either an integer, or a tuple of 1 to 3 integers (omitted dimensions default to 1). The types argument can be either a tuple of types or a tuple type, the latter being slightly faster.

source
CUDA.launchFunction
launch(f::CuFunction, args...; blocks::CuDim=1, threads::CuDim=1,
+       cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function f on the GPU, using blocks and threads as respectively the grid and block configuration. Dynamic shared memory is allocated according to shmem, and the kernel is launched on stream stream.

Arguments to a kernel should either be bitstype, in which case they will be copied to the internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use cudacall instead.

source
launch(exec::CuGraphExec, [stream::CuStream])

Launches an executable graph, by default in the currently-active stream.

source

Profiler Control

CUDA.@profileMacro
@profile [trace=false] [raw=false] code...
+@profile external=true code...

Profile the GPU execution of code.

There are two modes of operation, depending on whether external is true or false. The default value depends on whether Julia is being run under an external profiler.

Integrated profiler (external=false, the default)

In this mode, CUDA.jl will profile the execution of code and display the result. By default, a summary of host- and device-side execution will be shown, including any NVTX events. To display a chronological trace of the captured activity instead, trace can be set to true. Trace output will include an ID column that can be used to match host-side and device-side activity. If raw is true, all data will always be included, even if it may not be relevant. The output will be written to io, which defaults to stdout.

Slow operations will be highlighted in the output: Entries colored in yellow are among the slowest 25%, while entries colored in red are among the slowest 5% of all operations.

!!! compat "Julia 1.9" This functionality is only available on Julia 1.9 and later.

!!! compat "CUDA 11.2" Older versions of CUDA, before 11.2, contain bugs that may prevent the CUDA.@profile macro to work. It is recommended to use a newer runtime.

External profilers (external=true, when an external profiler is detected)

For more advanced profiling, it is possible to use an external profiling tool, such as NSight Systems or NSight Compute. When doing so, it is often advisable to only enable the profiler for the specific code region of interest. This can be done by wrapping the code with CUDA.@profile external=true, which used to be the only way to use this macro.

source
CUDA.Profile.startFunction
start()

Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then this call has no effect.

source
CUDA.Profile.stopFunction
stop()

Disables profile collection by the active profiling tool for the current context. If profiling is already disabled, then this call has no effect.

source

Texture Memory

Textures are represented by objects of type CuTexture which are bound to some underlying memory, either CuArrays or CuTextureArrays:

CUDA.CuTextureType
CuTexture{T,N,P}

N-dimensional texture object with elements of type T. These objects do not store data themselves, but are bound to another source of device memory. Texture objects can be passed to CUDA kernels, where they will be accessible through the CuDeviceTexture type.

Warning

Experimental API. Subject to change without deprecation.

source
CUDA.CuTextureMethod
CuTexture{T,N,P}(parent::P; address_mode, filter_mode, normalized_coordinates)

Construct an N-dimensional texture object with elements of type T as stored in parent.

Several keyword arguments alter the behavior of texture objects:

  • address_mode (wrap, clamp, mirror): how out-of-bounds values are accessed. Can be specified as a value for all dimensions, or as a tuple of N entries.
  • interpolation (nearest neighbour, linear, bilinear): how non-integral indices are fetched. Nearest-neighbour fetches a single value, others interpolate between multiple.
  • normalized_coordinates (true, false): whether indices are expected to fall in the normalized [0:1) range.

!!! warning Experimental API. Subject to change without deprecation.

source
CuTexture(x::CuTextureArray{T,N})

Create an N-dimensional texture object with elements of type T that will be read from x.

Warning

Experimental API. Subject to change without deprecation.

source
CuTexture(x::CuArray{T,N})

Create an N-dimensional texture object that reads from a CuArray.

Note that the memory must be well aligned and properly strided (good pitch). Currently, this is not enforced.

Warning

Experimental API. Subject to change without deprecation.

source

You can create CuTextureArray objects from both host and device memory:

CUDA.CuTextureArrayType
CuTextureArray{T,N}(undef, dims)

N-dimensional dense texture array with elements of type T. These arrays are optimized for texture fetching, and are only meant to be used as a source for CuTexture{T,N,P} objects.

Warning

Experimental API. Subject to change without deprecation.

source
CUDA.CuTextureArrayMethod
CuTextureArray(A::AbstractArray)

Allocate and initialize a texture array from host memory in A.

Warning

Experimental API. Subject to change without deprecation.

source
CuTextureArray(A::CuArray)

Allocate and initialize a texture array from device memory in A.

Warning

Experimental API. Subject to change without deprecation.

source

Occupancy API

The occupancy API can be used to figure out an appropriate launch configuration for a compiled kernel (represented as a CuFunction) on the current device:

CUDA.launch_configurationFunction
launch_configuration(fun::CuFunction; shmem=0, max_threads=0)

Calculate a suggested launch configuration for kernel fun requiring shmem bytes of dynamic shared memory. Returns a tuple with a suggested number of threads, and the minimal number of blocks to reach maximal occupancy. Optionally, the maximum number of threads can be constrained using max_threads.

In the case of a variable amount of shared memory, pass a callable object for shmem instead, taking a single integer representing the block size and returning the amount of dynamic shared memory for that configuration.

source
CUDA.active_blocksFunction
active_blocks(fun::CuFunction, threads; shmem=0)

Calculate the maximum number of active blocks per multiprocessor when running threads threads of a kernel fun requiring shmem bytes of dynamic shared memory.

source
CUDA.occupancyFunction
occupancy(fun::CuFunction, threads; shmem=0)

Calculate the theoretical occupancy of launching threads threads of a kernel fun requiring shmem bytes of dynamic shared memory.

source

Graph Execution

CUDA graphs can be easily recorded and executed using the high-level @captured macro:

CUDA.@capturedMacro
for ...
     @captured begin
         # code that executes several kernels or CUDA operations
     end
-end

A convenience macro for recording a graph of CUDA operations and automatically caching and updating the executable graph. This can improve performance when executing kernels in a loop, where the launch overhead might dominate the execution.

Warning

For this to be effective, the kernels and operations executed inside of the captured region should not significantly change across iterations of the loop. It is allowed to, e.g., change kernel arguments or inputs to operations, as this is handled by updating the cached executable graph. However, significant changes will result in an instantiation of the graph from scratch, which is an expensive operation.

See also: capture.

source

Low-level operations are available too:

CUDA.CuGraphType
CuGraph([flags])

Create an empty graph for use with low-level graph operations. If you want to create a graph while directly recording operations, use capture. For a high-level interface that also automatically executes the graph, use the @captured macro.

source
CUDA.captureFunction
capture([flags], [throw_error::Bool=true]) do
+end

A convenience macro for recording a graph of CUDA operations and automatically caching and updating the executable graph. This can improve performance when executing kernels in a loop, where the launch overhead might dominate the execution.

Warning

For this to be effective, the kernels and operations executed inside of the captured region should not significantly change across iterations of the loop. It is allowed to, e.g., change kernel arguments or inputs to operations, as this is handled by updating the cached executable graph. However, significant changes will result in an instantiation of the graph from scratch, which is an expensive operation.

See also: capture.

source

Low-level operations are available too:

CUDA.CuGraphType
CuGraph([flags])

Create an empty graph for use with low-level graph operations. If you want to create a graph while directly recording operations, use capture. For a high-level interface that also automatically executes the graph, use the @captured macro.

source
CUDA.captureFunction
capture([flags], [throw_error::Bool=true]) do
     ...
-end

Capture a graph of CUDA operations. The returned graph can then be instantiated and executed repeatedly for improved performance.

Note that many operations, like initial kernel compilation or memory allocations, cannot be captured. To work around this, you can set the throw_error keyword to false, which will cause this function to return nothing if such a failure happens. You can then try to evaluate the function in a regular way, and re-record afterwards.

See also: instantiate.

source
CUDA.instantiateFunction
instantiate(graph::CuGraph)

Creates an executable graph from a graph. This graph can then be launched, or updated with another graph.

See also: launch, update.

source
CUDA.launchMethod
launch(f::CuFunction, args...; blocks::CuDim=1, threads::CuDim=1,
-       cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function f on the GPU, using blocks and threads as respectively the grid and block configuration. Dynamic shared memory is allocated according to shmem, and the kernel is launched on stream stream.

Arguments to a kernel should either be bitstype, in which case they will be copied to the internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use cudacall instead.

source
launch(exec::CuGraphExec, [stream::CuStream])

Launches an executable graph, by default in the currently-active stream.

source
CUDA.updateFunction
update(exec::CuGraphExec, graph::CuGraph; [throw_error::Bool=true])

Check whether an executable graph can be updated with a graph and perform the update if possible. Returns a boolean indicating whether the update was successful. Unless throw_error is set to false, also throws an error if the update failed.

source
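A minimal sketch of the low-level workflow, assuming the in-place broadcast can be captured:

x = CUDA.rand(1024)
graph = CUDA.capture() do
    x .+= 1                    # operations recorded into the graph
end
exec = CUDA.instantiate(graph)
CUDA.launch(exec)              # replay the captured work
synchronize()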
+end

Capture a graph of CUDA operations. The returned graph can then be instantiated and executed repeatedly for improved performance.

Note that many operations, like initial kernel compilation or memory allocations, cannot be captured. To work around this, you can set the throw_error keyword to false, which will cause this function to return nothing if such a failure happens. You can then try to evaluate the function in a regular way, and re-record afterwards.

See also: instantiate.

source
CUDA.instantiateFunction
instantiate(graph::CuGraph)

Creates an executable graph from a graph. This graph can then be launched, or updated with another graph.

See also: launch, update.

source
CUDA.launchMethod
launch(f::CuFunction, args...; blocks::CuDim=1, threads::CuDim=1,
+       cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function f on the GPU, using blocks and threads as respectively the grid and block configuration. Dynamic shared memory is allocated according to shmem, and the kernel is launched on stream stream.

Arguments to a kernel should either be bitstype, in which case they will be copied to the internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use cudacall instead.

source
launch(exec::CuGraphExec, [stream::CuStream])

Launches an executable graph, by default in the currently-active stream.

source
CUDA.updateFunction
update(exec::CuGraphExec, graph::CuGraph; [throw_error::Bool=true])

Check whether an executable graph can be updated with a graph and perform the update if possible. Returns a boolean indicating whether the update was successful. Unless throw_error is set to false, also throws an error if the update failed.

source
diff --git a/dev/tutorials/custom_structs/index.html b/dev/tutorials/custom_structs/index.html index 6236189e34..49b006f95f 100644 --- a/dev/tutorials/custom_structs/index.html +++ b/dev/tutorials/custom_structs/index.html @@ -32,4 +32,4 @@ Interpolate(xs, ys) end

Now our struct plays nicely with CUDA.jl:

result = itp.(pts)
2-element CuArray{Float64, 1, CUDA.DeviceMemory}:
  20.0
- 30.0

It works, we get the same result as on the CPU.

@assert CuArray(result_cpu) == result

Alternatively, instead of defining Adapt.adapt_structure explicitly, we could have done

Adapt.@adapt_structure Interpolate

which expands to the same code that we wrote manually.


This page was generated using Literate.jl.

+ 30.0

It works, we get the same result as on the CPU.

@assert CuArray(result_cpu) == result

Alternatively, instead of defining Adapt.adapt_structure explicitly, we could have done

Adapt.@adapt_structure Interpolate

which expands to the same code that we wrote manually.


This page was generated using Literate.jl.

diff --git a/dev/tutorials/introduction/index.html b/dev/tutorials/introduction/index.html index 8d801466b0..11b00ad0a3 100644 --- a/dev/tutorials/introduction/index.html +++ b/dev/tutorials/introduction/index.html @@ -89,45 +89,45 @@ @cuda gpu_add1!(y, x) end end
bench_gpu1! (generic function with 1 method)
@btime bench_gpu1!($y_d, $x_d)
  119.783 ms (47 allocations: 1.23 KiB)

That's a lot slower than the version above based on broadcasting. What happened?

Profiling

When you don't get the performance you expect, usually your first step should be to profile the code and see where it's spending its time:

bench_gpu1!(y_d, x_d)  # run it once to force compilation
-CUDA.@profile bench_gpu1!(y_d, x_d)
Profiler ran for 77.68 ms, capturing 804 events.
+CUDA.@profile bench_gpu1!(y_d, x_d)
Profiler ran for 75.97 ms, capturing 804 events.
 
-Host-side activity: calling CUDA APIs took 76.45 ms (98.43% of the trace)
+Host-side activity: calling CUDA APIs took 74.96 ms (98.67% of the trace)
 ┌──────────┬────────────┬───────┬─────────────────────┐
 │ Time (%)  Total time  Calls  Name                │
 ├──────────┼────────────┼───────┼─────────────────────┤
-│   98.42% │   76.45 ms │     1 │ cuStreamSynchronize │
-│    0.05% │   37.91 µs │     1 │ cuLaunchKernel      │
-│    0.00% │    2.86 µs │     1 │ cuCtxSetCurrent     │
-│    0.00% │  715.26 ns │     1 │ cuCtxGetDevice      │
-│    0.00% │  476.84 ns │     1 │ cuDeviceGetCount    │
+│   98.67% │   74.95 ms │     1 │ cuStreamSynchronize │
+│    0.06% │   43.15 µs │     1 │ cuLaunchKernel      │
+│    0.00% │    3.34 µs │     1 │ cuCtxSetCurrent     │
+│    0.00% │  953.67 ns │     1 │ cuCtxGetDevice      │
+│    0.00% │  953.67 ns │     1 │ cuDeviceGetCount    │
 └──────────┴────────────┴───────┴─────────────────────┘
 
-Device-side activity: GPU was busy for 77.28 ms (99.50% of the trace)
+Device-side activity: GPU was busy for 75.84 ms (99.83% of the trace)
 ┌──────────┬────────────┬───────┬───────────────────────────────────────────────
 │ Time (%)  Total time  Calls  Name                                         ⋯
 ├──────────┼────────────┼───────┼───────────────────────────────────────────────
-│   99.50% │   77.28 ms │     1 │ _Z9gpu_add1_13CuDeviceArrayI7Float32Ll1ELl1E ⋯
+│   99.83% │   75.84 ms │     1 │ _Z9gpu_add1_13CuDeviceArrayI7Float32Ll1ELl1E ⋯
 └──────────┴────────────┴───────┴───────────────────────────────────────────────
                                                                 1 column omitted
-

You can see that almost all of the time was spent in ptxcall_gpu_add1__1, the name of the kernel that CUDA.jl assigned when compiling gpu_add1! for these inputs. (Had you created arrays of multiple data types, e.g., xu_d = CUDA.fill(0x01, N), you might have also seen ptxcall_gpu_add1__2 and so on. Like the rest of Julia, you can define a single method and it will be specialized at compile time for the particular data types you're using.)

For further insight, run the profiling with the option trace=true

CUDA.@profile trace=true bench_gpu1!(y_d, x_d)
Profiler ran for 107.91 ms, capturing 804 events.
+

You can see that almost all of the time was spent in ptxcall_gpu_add1__1, the name of the kernel that CUDA.jl assigned when compiling gpu_add1! for these inputs. (Had you created arrays of multiple data types, e.g., xu_d = CUDA.fill(0x01, N), you might have also seen ptxcall_gpu_add1__2 and so on. Like the rest of Julia, you can define a single method and it will be specialized at compile time for the particular data types you're using.)

For further insight, run the profiling with the option trace=true

CUDA.@profile trace=true bench_gpu1!(y_d, x_d)
Profiler ran for 107.86 ms, capturing 804 events.
 
-Host-side activity: calling CUDA APIs took 107.04 ms (99.19% of the trace)
+Host-side activity: calling CUDA APIs took 106.85 ms (99.06% of the trace)
 ┌─────┬───────────┬───────────┬────────┬─────────────────────┐
 │  ID      Start       Time  Thread  Name                │
 ├─────┼───────────┼───────────┼────────┼─────────────────────┤
-│  21 │  76.53 µs │  34.09 µs │      1 │ cuLaunchKernel      │
-│ 795 │ 822.07 µs │   2.62 µs │      2 │ cuCtxSetCurrent     │
-│ 796 │ 829.46 µs │ 715.26 ns │      2 │ cuCtxGetDevice      │
-│ 797 │  837.8 µs │ 715.26 ns │      2 │ cuDeviceGetCount    │
-│ 800 │ 846.15 µs │ 107.04 ms │      2 │ cuStreamSynchronize │
+│  21 │  46.97 µs │   36.0 µs │      1 │ cuLaunchKernel      │
+│ 795 │ 939.85 µs │   3.58 µs │      2 │ cuCtxSetCurrent     │
+│ 796 │ 950.34 µs │   1.19 µs │      2 │ cuCtxGetDevice      │
+│ 797 │ 963.45 µs │ 953.67 ns │      2 │ cuDeviceGetCount    │
+│ 800 │ 980.38 µs │ 106.84 ms │      2 │ cuStreamSynchronize │
 └─────┴───────────┴───────────┴────────┴─────────────────────┘
 
-Device-side activity: GPU was busy for 107.76 ms (99.86% of the trace)
-┌────┬───────────┬───────────┬─────────┬────────┬──────┬────────────────────────
-│ ID      Start       Time  Threads  Blocks  Regs  Name                  ⋯
-├────┼───────────┼───────────┼─────────┼────────┼──────┼────────────────────────
-│ 21 │ 113.01 µs │ 107.76 ms │       1 │      1 │   19 │ _Z9gpu_add1_13CuDevic ⋯
-└────┴───────────┴───────────┴─────────┴────────┴──────┴────────────────────────
+Device-side activity: GPU was busy for 107.73 ms (99.88% of the trace)
+┌────┬──────────┬───────────┬─────────┬────────┬──────┬─────────────────────────
+│ ID     Start       Time  Threads  Blocks  Regs  Name                   ⋯
+├────┼──────────┼───────────┼─────────┼────────┼──────┼─────────────────────────
+│ 21 │ 83.68 µs │ 107.73 ms │       1 │      1 │   19 │ _Z9gpu_add1_13CuDevice ⋯
+└────┴──────────┴───────────┴─────────┴────────┴──────┴─────────────────────────
                                                                 1 column omitted
 

The key thing to note here is that we are only using a single block with a single thread. These terms will be explained shortly, but for now, suffice it to say that this is an indication that this computation ran sequentially. Of note, sequential processing with GPUs is much slower than with CPUs; where GPUs shine is with large-scale parallelism.

Writing a parallel GPU kernel

To speed up the kernel, we want to parallelize it, which means assigning different tasks to different threads. To facilitate the assignment of work, each CUDA thread gets access to variables that indicate its own unique identity, much as Threads.threadid() does for CPU threads. The CUDA analogs of threadid and nthreads are called threadIdx and blockDim, respectively; one difference is that these return a 3-dimensional structure with fields x, y, and z to simplify Cartesian indexing for up to 3-dimensional arrays. Consequently, we can assign unique work in the following way:

function gpu_add2!(y, x)
     index = threadIdx().x    # this example only requires linear indexing, so just use `x`
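     # Hedged sketch of the remainder of this kernel: stride across the array so that
     # each thread handles a disjoint set of indices.
     stride = blockDim().x    # number of threads in the block
     for i = index:stride:length(y)
         @inbounds y[i] += x[i]
     end
     return nothing
 end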
@@ -162,21 +162,21 @@
     CUDA.@sync begin
         @cuda threads=256 blocks=numblocks gpu_add3!(y, x)
     end
-end
bench_gpu3! (generic function with 1 method)
@btime bench_gpu3!($y_d, $x_d)
  67.268 μs (52 allocations: 1.31 KiB)

Finally, we've achieved performance similar to what we got with the broadcasted version. Let's profile again to confirm this launch configuration:

CUDA.@profile trace=true bench_gpu3!(y_d, x_d)
Profiler ran for 13.86 ms, capturing 302 events.
+end
bench_gpu3! (generic function with 1 method)
@btime bench_gpu3!($y_d, $x_d)
  67.268 μs (52 allocations: 1.31 KiB)

Finally, we've achieved performance similar to what we got with the broadcasted version. Let's profile again to confirm this launch configuration:

CUDA.@profile trace=true bench_gpu3!(y_d, x_d)
Profiler ran for 14.76 ms, capturing 296 events.
 
-Host-side activity: calling CUDA APIs took 98.47 µs (0.71% of the trace)
+Host-side activity: calling CUDA APIs took 101.57 µs (0.69% of the trace)
 ┌─────┬──────────┬──────────┬─────────────────────┐
 │  ID │    Start │     Time │ Name                │
 ├─────┼──────────┼──────────┼─────────────────────┤
-│  21 │ 13.66 ms │ 41.72 µs │ cuLaunchKernel      │
-│ 298 │ 13.84 ms │  5.48 µs │ cuStreamSynchronize │
+│  21 │ 14.55 ms │ 46.73 µs │ cuLaunchKernel      │
+│ 292 │ 14.74 ms │  5.25 µs │ cuStreamSynchronize │
 └─────┴──────────┴──────────┴─────────────────────┘
 
-Device-side activity: GPU was busy for 131.37 µs (0.95% of the trace)
+Device-side activity: GPU was busy for 131.61 µs (0.89% of the trace)
 ┌────┬─────────┬───────────┬─────────┬────────┬──────┬──────────────────────────
 │ ID    Start       Time  Threads  Blocks  Regs  Name                    ⋯
 ├────┼─────────┼───────────┼─────────┼────────┼──────┼──────────────────────────
-│ 21 │ 13.7 ms │ 131.37 µs │     256 │   4096 │   40 │ _Z9gpu_add3_13CuDeviceA ⋯
+│ 21 │ 14.6 ms │ 131.61 µs │     256 │   4096 │   40 │ _Z9gpu_add3_13CuDeviceA ⋯
 └────┴─────────┴───────────┴─────────┴────────┴──────┴──────────────────────────
                                                                 1 column omitted
 

In the previous example, the number of threads was hard-coded to 256. This is not ideal, as using more threads generally improves performance, but the maximum number of threads that can be launched depends on your GPU as well as on the kernel. To automatically select an appropriate number of threads, it is recommended to use the launch configuration API. This API takes a compiled (but not launched) kernel and returns a tuple with an upper bound on the number of threads and the minimum number of blocks required to fully saturate the GPU:

kernel = @cuda launch=false gpu_add3!(y_d, x_d)
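# Hedged sketch of using the occupancy API described above: query a suggested launch
# configuration for the compiled (but not launched) kernel, then size the launch so
# the whole array is covered.
config = launch_configuration(kernel.fun)    # upper bound on threads, minimum blocks
threads = min(length(y_d), config.threads)
blocks = cld(length(y_d), threads)
kernel(y_d, x_d; threads, blocks)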
@@ -227,4 +227,4 @@
  [1] throw_boundserror at abstractarray.jl:484
  [2] checkbounds at abstractarray.jl:449
  [3] setindex! at /home/tbesard/Julia/CUDA/src/device/array.jl:79
- [4] some_kernel at /tmp/tmpIMYANH:6
Warning

On older GPUs (with a compute capability below sm_70) these errors are fatal, and effectively kill the CUDA environment. On such GPUs, it's often a good idea to perform your "sanity checks" using code that runs on the CPU and only turn over the computation to the GPU once you've deemed it to be safe.

Summary

Keep in mind that the high-level functionality of CUDA often means that you don't need to worry about writing kernels at such a low level. However, there are many cases where computations can be optimized using clever low-level manipulations. Hopefully, you now feel comfortable taking the plunge.


This page was generated using Literate.jl.

+ [4] some_kernel at /tmp/tmpIMYANH:6
Warning

On older GPUs (with a compute capability below sm_70) these errors are fatal, and effectively kill the CUDA environment. On such GPUs, it's often a good idea to perform your "sanity checks" using code that runs on the CPU and only turn over the computation to the GPU once you've deemed it to be safe.
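A minimal sketch of such a guard, assuming you simply want to branch on the device's compute capability before relying on device-side error reporting:

if capability(device()) < v"7.0"
    @warn "Pre-sm_70 GPU detected: device-side errors are fatal, so validate inputs on the CPU first"
end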

Summary

Keep in mind that the high-level functionality of CUDA often means that you don't need to worry about writing kernels at such a low level. However, there are many cases where computations can be optimized using clever low-level manipulations. Hopefully, you now feel comfortable taking the plunge.


This page was generated using Literate.jl.

diff --git a/dev/tutorials/performance/index.html b/dev/tutorials/performance/index.html index ccd4c51492..dc8dc5f121 100644 --- a/dev/tutorials/performance/index.html +++ b/dev/tutorials/performance/index.html @@ -53,4 +53,4 @@ blocks = cld(length(y), threads) CUDA.@sync kernel(y, x; threads, blocks) -end
bench_gpu5! (generic function with 1 method)
@btime bench_gpu4!($y_d, $x_d)
  76.149 ms (57 allocations: 3.70 KiB)
@btime bench_gpu5!($y_d, $x_d)
  75.732 ms (58 allocations: 3.73 KiB)

This benchmark shows only a small performance benefit for this kernel; however, we can see a big difference in the number of registers used, recalling that 28 registers were needed when using a StepRange:

CUDA.registers(@cuda gpu_add5!(y_d, x_d))
  12

This page was generated using Literate.jl.

+end
bench_gpu5! (generic function with 1 method)
@btime bench_gpu4!($y_d, $x_d)
  76.149 ms (57 allocations: 3.70 KiB)
@btime bench_gpu5!($y_d, $x_d)
  75.732 ms (58 allocations: 3.73 KiB)

This benchmark shows only a small performance benefit for this kernel; however, we can see a big difference in the number of registers used, recalling that 28 registers were needed when using a StepRange:

CUDA.registers(@cuda gpu_add5!(y_d, x_d))
  12
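For comparison, a hedged sketch (assuming gpu_add4! is the StepRange-based kernel referred to above):

CUDA.registers(@cuda gpu_add4!(y_d, x_d))  # StepRange-based loop: around 28 registers
CUDA.registers(@cuda gpu_add5!(y_d, x_d))  # hand-written loop: 12 registers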

This page was generated using Literate.jl.

diff --git a/dev/usage/array/index.html b/dev/usage/array/index.html index 6a9fb27230..a5b97fd9f4 100644 --- a/dev/usage/array/index.html +++ b/dev/usage/array/index.html @@ -250,4 +250,4 @@ julia> fft(a) 2×2 CuArray{ComplexF32, 2, CUDA.DeviceMemory}: 2.6692+0.0im 0.65323+0.0im - -1.11072+0.0im 0.749168+0.0im + -1.11072+0.0im 0.749168+0.0im diff --git a/dev/usage/memory/index.html b/dev/usage/memory/index.html index 8ab12518f2..34a0ed7a47 100644 --- a/dev/usage/memory/index.html +++ b/dev/usage/memory/index.html @@ -91,4 +91,4 @@ println("Batch $batch: ", a .+ b) end Batch 1: [3] -Batch 2: [7]

For each batch, every argument (assumed to be array-like) is uploaded to the GPU using the adapt mechanism from above. Afterwards, the memory is eagerly returned to the CUDA memory pool using unsafe_free! to reduce GC pressure.

+Batch 2: [7]

For each batch, every argument (assumed to be array-like) is uploaded to the GPU using the adapt mechanism from above. Afterwards, the memory is eagerly returned to the CUDA memory pool using unsafe_free! to reduce GC pressure.
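A minimal sketch of that eager-free pattern, assuming a hypothetical process_batch function (the upload mirrors what the adapt mechanism does for array-like arguments):

a_d = CuArray(a)              # upload this batch's data to the GPU
try
    process_batch(a_d)
finally
    CUDA.unsafe_free!(a_d)    # immediately return the buffer to the memory pool
end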

diff --git a/dev/usage/multigpu/index.html b/dev/usage/multigpu/index.html index efb6ef7080..a16859a3cf 100644 --- a/dev/usage/multigpu/index.html +++ b/dev/usage/multigpu/index.html @@ -46,4 +46,4 @@ using Test c = Array(d_c) -@test a+b ≈ c +@test a+b ≈ c diff --git a/dev/usage/multitasking/index.html b/dev/usage/multitasking/index.html index c4f202afd2..5a25503755 100644 --- a/dev/usage/multitasking/index.html +++ b/dev/usage/multitasking/index.html @@ -73,4 +73,4 @@ # comparison results[1] == results[2] -end

By using the Threads.@spawn macro, the tasks will be scheduled to run on different CPU threads. This can be useful when you are calling a lot of operations that "block" in CUDA, e.g., memory copies to or from unpinned memory. The same effect occurs when using a Threads.@threads for ... end block. Generally, though, operations that synchronize GPU execution (including the call to synchronize itself) are implemented in a way that yields back to the Julia scheduler, enabling concurrent execution without requiring the use of different CPU threads.

Warning

Use of multiple threads with CUDA.jl is a recent addition, and there may still be bugs or performance issues.

+end

By using the Threads.@spawn macro, the tasks will be scheduled to run on different CPU threads. This can be useful when you are calling a lot of operations that "block" in CUDA, e.g., memory copies to or from unpinned memory. The same effect occurs when using a Threads.@threads for ... end block. Generally, though, operations that synchronize GPU execution (including the call to synchronize itself) are implemented in a way that yields back to the Julia scheduler, enabling concurrent execution without requiring the use of different CPU threads.
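A minimal sketch of that pattern, assuming d_a and d_b are existing device arrays (each task runs on its own CPU thread and uses its own task-local stream):

results = Vector{Any}(undef, 2)
@sync begin
    Threads.@spawn results[1] = Array(d_a .* 2)   # independent GPU work on one thread
    Threads.@spawn results[2] = Array(d_b .* 2)   # independent GPU work on another thread
end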

Warning

Use of multiple threads with CUDA.jl is a recent addition, and there may still be bugs or performance issues.

diff --git a/dev/usage/overview/index.html b/dev/usage/overview/index.html index 774c630475..448673767b 100644 --- a/dev/usage/overview/index.html +++ b/dev/usage/overview/index.html @@ -31,4 +31,4 @@ @show capability(device) end

If such high-level wrappers are missing, you can always access the underlying C API (functions and structures prefixed with cu) without ever having to exit Julia:

version = Ref{Cint}()
 CUDA.cuDriverGetVersion(version)
-@show version[]
+@show version[] diff --git a/dev/usage/workflow/index.html b/dev/usage/workflow/index.html index 74ab937460..0beb275192 100644 --- a/dev/usage/workflow/index.html +++ b/dev/usage/workflow/index.html @@ -35,4 +35,4 @@ 2 julia> CUDA.@allowscalar a[1] += 1 -3 +3